{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5198, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.03936368, "balance_loss_mlp": 2.84994221, "epoch": 0.00019238168526356292, "flos": 470575609344.0, "grad_norm": 15.847607787273237, "language_loss": 2.91765308, "learning_rate": 0.0, "loss": 1.97528625, "num_input_tokens_seen": 67104, "router_z_loss_mlp": 10.859375, "step": 1, "time_per_iteration": 24.278199672698975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02015882, "balance_loss_mlp": 1.26743817, "epoch": 0.00038476337052712584, "flos": 504556065792.0, "grad_norm": 26.39987998366427, "language_loss": 2.42349291, "learning_rate": 0.00013726078121135892, "loss": 2.44365168, "num_input_tokens_seen": 134080, "router_z_loss_mlp": 7.4765625, "step": 2, "time_per_iteration": 2.74550199508667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02034476, "balance_loss_mlp": 1.28603244, "epoch": 0.0005771450557906887, "flos": 599161245696.0, "grad_norm": 23.46624299076427, "language_loss": 2.13354897, "learning_rate": 0.00021755319103969496, "loss": 2.15389395, "num_input_tokens_seen": 205152, "router_z_loss_mlp": 7.4765625, "step": 3, "time_per_iteration": 2.820986270904541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02058399, "balance_loss_mlp": 1.29927421, "epoch": 0.0007695267410542517, "flos": 580405326336.0, "grad_norm": 3.493910581799846, "language_loss": 1.37129521, "learning_rate": 0.00027452156242271784, "loss": 1.3918792, "num_input_tokens_seen": 269664, "router_z_loss_mlp": 7.5859375, "step": 4, "time_per_iteration": 2.677243947982788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02066247, "balance_loss_mlp": 1.30979228, "epoch": 0.0009619084263178145, "flos": 486116204544.0, "grad_norm": 0.8674817587168525, "language_loss": 1.33187473, "learning_rate": 0.0003187096642208417, "loss": 1.35253716, "num_input_tokens_seen": 338560, "router_z_loss_mlp": 7.55859375, "step": 5, "time_per_iteration": 2.6032657623291016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02071583, "balance_loss_mlp": 1.31322157, "epoch": 0.0011542901115813775, "flos": 560028349440.0, "grad_norm": 2.033424387355904, "language_loss": 1.30649018, "learning_rate": 0.0003548139722510539, "loss": 1.32720602, "num_input_tokens_seen": 410112, "router_z_loss_mlp": 7.578125, "step": 6, "time_per_iteration": 2.6967170238494873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02101369, "balance_loss_mlp": 1.33652186, "epoch": 0.0013466717968449403, "flos": 533966307840.0, "grad_norm": 0.7061194413900653, "language_loss": 1.22160292, "learning_rate": 0.00038533972973918044, "loss": 1.24261677, "num_input_tokens_seen": 477552, "router_z_loss_mlp": 7.64453125, "step": 7, "time_per_iteration": 2.7199785709381104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02146806, "balance_loss_mlp": 1.36975181, "epoch": 0.0015390534821085034, "flos": 492295739904.0, "grad_norm": 0.35850971046258795, "language_loss": 1.17196155, "learning_rate": 0.0004117823436340768, "loss": 1.19342971, "num_input_tokens_seen": 549184, "router_z_loss_mlp": 7.76171875, "step": 8, "time_per_iteration": 2.6428823471069336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02153063, "balance_loss_mlp": 1.36837983, "epoch": 0.0017314351673720662, "flos": 564676033536.0, "grad_norm": 0.22105321402960548, "language_loss": 1.2430563, "learning_rate": 0.00043510638207938993, "loss": 1.26458693, "num_input_tokens_seen": 622880, "router_z_loss_mlp": 7.8359375, "step": 9, "time_per_iteration": 2.7773404121398926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02194678, "balance_loss_mlp": 1.4077065, "epoch": 0.001923816852635629, "flos": 593406798336.0, "grad_norm": 0.2650641779955913, "language_loss": 1.13927829, "learning_rate": 0.00045597044543220066, "loss": 1.16122508, "num_input_tokens_seen": 693584, "router_z_loss_mlp": 7.87109375, "step": 10, "time_per_iteration": 2.6966803073883057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02215625, "balance_loss_mlp": 1.42216802, "epoch": 0.002116198537899192, "flos": 609625046016.0, "grad_norm": 0.17099192662038445, "language_loss": 1.11761594, "learning_rate": 0.00047484428652143135, "loss": 1.13977218, "num_input_tokens_seen": 774432, "router_z_loss_mlp": 7.921875, "step": 11, "time_per_iteration": 2.846426010131836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02218955, "balance_loss_mlp": 1.42854977, "epoch": 0.002308580223162755, "flos": 545129409024.0, "grad_norm": 0.11899482154082718, "language_loss": 1.17641664, "learning_rate": 0.0004920747534624128, "loss": 1.19860613, "num_input_tokens_seen": 844304, "router_z_loss_mlp": 7.890625, "step": 12, "time_per_iteration": 2.605074882507324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02207543, "balance_loss_mlp": 1.41751897, "epoch": 0.002500961908426318, "flos": 644750461440.0, "grad_norm": 0.14172497717456267, "language_loss": 1.20158505, "learning_rate": 0.0005079252465375872, "loss": 1.22366059, "num_input_tokens_seen": 915104, "router_z_loss_mlp": 7.8984375, "step": 13, "time_per_iteration": 2.7560088634490967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02203989, "balance_loss_mlp": 1.41625452, "epoch": 0.0026933435936898806, "flos": 487853259264.0, "grad_norm": 0.1448362910448976, "language_loss": 1.09927368, "learning_rate": 0.0005226005109505393, "loss": 1.12131357, "num_input_tokens_seen": 982720, "router_z_loss_mlp": 7.859375, "step": 14, "time_per_iteration": 2.623379707336426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02125464, "balance_loss_mlp": 1.36481309, "epoch": 0.0028857252789534437, "flos": 434599644672.0, "grad_norm": 0.13392565488521943, "language_loss": 1.15514731, "learning_rate": 0.0005362628552605367, "loss": 1.17640197, "num_input_tokens_seen": 1050528, "router_z_loss_mlp": 7.59765625, "step": 15, "time_per_iteration": 2.596914768218994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02122013, "balance_loss_mlp": 1.3682282, "epoch": 0.0030781069642170067, "flos": 596739944448.0, "grad_norm": 0.12347082932885804, "language_loss": 1.19854355, "learning_rate": 0.0005490431248454357, "loss": 1.21976352, "num_input_tokens_seen": 1116512, "router_z_loss_mlp": 7.53125, "step": 16, "time_per_iteration": 2.685072898864746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02419001, "balance_loss_mlp": 1.67742407, "epoch": 0.0032704886494805694, "flos": 1538188102656.0, "grad_norm": 0.2736231848322761, "language_loss": 0.75705111, "learning_rate": 0.0005610483427624225, "loss": 0.78124118, "num_input_tokens_seen": 1351216, "router_z_loss_mlp": 7.40625, "step": 17, "time_per_iteration": 5.928683757781982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02002798, "balance_loss_mlp": 1.29097593, "epoch": 0.0034628703347441324, "flos": 473969677824.0, "grad_norm": 0.09154168539226555, "language_loss": 1.06151795, "learning_rate": 0.0005723671632907488, "loss": 1.08154595, "num_input_tokens_seen": 1420512, "router_z_loss_mlp": 7.12109375, "step": 18, "time_per_iteration": 2.6618175506591797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01945774, "balance_loss_mlp": 1.26141703, "epoch": 0.0036552520200076955, "flos": 448537554432.0, "grad_norm": 0.11342789334024792, "language_loss": 1.1168499, "learning_rate": 0.0005830738490244919, "loss": 1.13630772, "num_input_tokens_seen": 1484976, "router_z_loss_mlp": 6.8515625, "step": 19, "time_per_iteration": 2.5248160362243652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01908107, "balance_loss_mlp": 1.24625731, "epoch": 0.003847633705271258, "flos": 636174217728.0, "grad_norm": 0.10096694408553891, "language_loss": 1.13845825, "learning_rate": 0.0005932312266435596, "loss": 1.15753937, "num_input_tokens_seen": 1557392, "router_z_loss_mlp": 6.62109375, "step": 20, "time_per_iteration": 2.800579309463501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01843731, "balance_loss_mlp": 1.21316147, "epoch": 0.004040015390534821, "flos": 589495491072.0, "grad_norm": 0.1378013237236713, "language_loss": 1.09039617, "learning_rate": 0.0006028929207788754, "loss": 1.10883355, "num_input_tokens_seen": 1626064, "router_z_loss_mlp": 6.30078125, "step": 21, "time_per_iteration": 2.693075656890869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01796963, "balance_loss_mlp": 1.19309616, "epoch": 0.004232397075798384, "flos": 756574940160.0, "grad_norm": 0.10529209836160877, "language_loss": 1.11936951, "learning_rate": 0.0006121050677327902, "loss": 1.13733912, "num_input_tokens_seen": 1696528, "router_z_loss_mlp": 6.03125, "step": 22, "time_per_iteration": 2.8881568908691406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01746784, "balance_loss_mlp": 1.17724967, "epoch": 0.004424778761061947, "flos": 526692119040.0, "grad_norm": 0.085047282331545, "language_loss": 1.02962387, "learning_rate": 0.0006209076479463684, "loss": 1.04709172, "num_input_tokens_seen": 1765936, "router_z_loss_mlp": 5.70703125, "step": 23, "time_per_iteration": 2.630469799041748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01714578, "balance_loss_mlp": 1.16831291, "epoch": 0.00461716044632551, "flos": 548168518656.0, "grad_norm": 0.1446104563316411, "language_loss": 1.12823486, "learning_rate": 0.0006293355346737718, "loss": 1.1453805, "num_input_tokens_seen": 1841632, "router_z_loss_mlp": 5.46875, "step": 24, "time_per_iteration": 2.662325382232666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01664908, "balance_loss_mlp": 1.14725351, "epoch": 0.004809542131589073, "flos": 567584091648.0, "grad_norm": 0.08929005506461926, "language_loss": 1.08926165, "learning_rate": 0.0006374193284416834, "loss": 1.10591078, "num_input_tokens_seen": 1920256, "router_z_loss_mlp": 5.17578125, "step": 25, "time_per_iteration": 2.7794790267944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01647718, "balance_loss_mlp": 1.15752983, "epoch": 0.005001923816852636, "flos": 470629191168.0, "grad_norm": 0.382953647696995, "language_loss": 1.07588863, "learning_rate": 0.0006451860277489461, "loss": 1.09236586, "num_input_tokens_seen": 1986528, "router_z_loss_mlp": 4.89453125, "step": 26, "time_per_iteration": 2.6574552059173584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01623745, "balance_loss_mlp": 1.1686517, "epoch": 0.005194305502116198, "flos": 415502701056.0, "grad_norm": 0.13377036730821817, "language_loss": 1.14740276, "learning_rate": 0.0006526595731190848, "loss": 1.16364002, "num_input_tokens_seen": 2048016, "router_z_loss_mlp": 4.55078125, "step": 27, "time_per_iteration": 2.5226099491119385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01558493, "balance_loss_mlp": 1.14078379, "epoch": 0.005386687187379761, "flos": 628771548672.0, "grad_norm": 0.07887885702942038, "language_loss": 1.08901012, "learning_rate": 0.0006598612921618983, "loss": 1.10459495, "num_input_tokens_seen": 2127664, "router_z_loss_mlp": 4.18359375, "step": 28, "time_per_iteration": 2.839459180831909 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01503024, "balance_loss_mlp": 1.11487842, "epoch": 0.005579068872643324, "flos": 886483201536.0, "grad_norm": 0.08107526710192482, "language_loss": 1.0255661, "learning_rate": 0.0006668102665011454, "loss": 1.04059625, "num_input_tokens_seen": 2213952, "router_z_loss_mlp": 3.87695312, "step": 29, "time_per_iteration": 3.257913589477539 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01474291, "balance_loss_mlp": 1.11227608, "epoch": 0.005771450557906887, "flos": 547560622080.0, "grad_norm": 0.13697687064909753, "language_loss": 1.11483085, "learning_rate": 0.0006735236364718957, "loss": 1.1295737, "num_input_tokens_seen": 2284736, "router_z_loss_mlp": 3.6171875, "step": 30, "time_per_iteration": 2.7084178924560547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0142553, "balance_loss_mlp": 1.09460521, "epoch": 0.00596383224317045, "flos": 532026620928.0, "grad_norm": 0.11726589989245696, "language_loss": 1.10265064, "learning_rate": 0.0006800168558381346, "loss": 1.11690593, "num_input_tokens_seen": 2354384, "router_z_loss_mlp": 3.31054688, "step": 31, "time_per_iteration": 2.588890552520752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01390474, "balance_loss_mlp": 1.08758759, "epoch": 0.0061562139284340135, "flos": 589082886144.0, "grad_norm": 0.10666498872881085, "language_loss": 1.13109517, "learning_rate": 0.0006863039060567947, "loss": 1.14499998, "num_input_tokens_seen": 2419440, "router_z_loss_mlp": 3.0234375, "step": 32, "time_per_iteration": 2.671940326690674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01372012, "balance_loss_mlp": 1.09372997, "epoch": 0.006348595613697576, "flos": 618231025152.0, "grad_norm": 0.09439068448398888, "language_loss": 1.06106949, "learning_rate": 0.0006923974775611263, "loss": 1.07478976, "num_input_tokens_seen": 2496368, "router_z_loss_mlp": 2.78710938, "step": 33, "time_per_iteration": 2.854475498199463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01370442, "balance_loss_mlp": 1.11390388, "epoch": 0.006540977298961139, "flos": 777910376448.0, "grad_norm": 0.06215931521992215, "language_loss": 1.03014469, "learning_rate": 0.0006983091239737814, "loss": 1.04384923, "num_input_tokens_seen": 2573280, "router_z_loss_mlp": 2.56445312, "step": 34, "time_per_iteration": 3.0690298080444336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01361344, "balance_loss_mlp": 1.12464166, "epoch": 0.006733358984224702, "flos": 667143475200.0, "grad_norm": 0.09515467516314563, "language_loss": 1.01683736, "learning_rate": 0.0007040493939600222, "loss": 1.03045082, "num_input_tokens_seen": 2647248, "router_z_loss_mlp": 2.36523438, "step": 35, "time_per_iteration": 2.8111989498138428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01344012, "balance_loss_mlp": 1.12600231, "epoch": 0.006925740669488265, "flos": 564372085248.0, "grad_norm": 0.06987238068095514, "language_loss": 1.02534437, "learning_rate": 0.0007096279445021078, "loss": 1.0387845, "num_input_tokens_seen": 2720736, "router_z_loss_mlp": 2.18554688, "step": 36, "time_per_iteration": 2.704871654510498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01340389, "balance_loss_mlp": 1.14107156, "epoch": 0.007118122354751828, "flos": 549887947776.0, "grad_norm": 0.1404335763188921, "language_loss": 1.09097314, "learning_rate": 0.0007150536386503726, "loss": 1.10437703, "num_input_tokens_seen": 2800336, "router_z_loss_mlp": 1.9921875, "step": 37, "time_per_iteration": 2.872793436050415 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01315876, "balance_loss_mlp": 1.13486814, "epoch": 0.007310504040015391, "flos": 702490973184.0, "grad_norm": 0.16061978088166937, "language_loss": 1.01896858, "learning_rate": 0.0007203346302358509, "loss": 1.0321275, "num_input_tokens_seen": 2883184, "router_z_loss_mlp": 1.81054688, "step": 38, "time_per_iteration": 2.9352476596832275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01304512, "balance_loss_mlp": 1.13332772, "epoch": 0.007502885725278953, "flos": 599316890112.0, "grad_norm": 0.19798610454398824, "language_loss": 1.06942129, "learning_rate": 0.000725478437577282, "loss": 1.08246636, "num_input_tokens_seen": 2960736, "router_z_loss_mlp": 1.71386719, "step": 39, "time_per_iteration": 2.766380786895752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01266397, "balance_loss_mlp": 1.10894561, "epoch": 0.007695267410542516, "flos": 560285309952.0, "grad_norm": 0.0682924496804484, "language_loss": 1.01676083, "learning_rate": 0.0007304920078549186, "loss": 1.02942467, "num_input_tokens_seen": 3033472, "router_z_loss_mlp": 1.57324219, "step": 40, "time_per_iteration": 2.7017316818237305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01260084, "balance_loss_mlp": 1.10988009, "epoch": 0.007887649095806078, "flos": 508170765312.0, "grad_norm": 0.18661861035366387, "language_loss": 1.03648829, "learning_rate": 0.0007353817735343603, "loss": 1.04908907, "num_input_tokens_seen": 3107824, "router_z_loss_mlp": 1.50097656, "step": 41, "time_per_iteration": 2.7103593349456787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01243555, "balance_loss_mlp": 1.10651195, "epoch": 0.008080030781069641, "flos": 503893840896.0, "grad_norm": 0.09436856387031409, "language_loss": 0.996611, "learning_rate": 0.0007401537019902344, "loss": 1.00904644, "num_input_tokens_seen": 3176528, "router_z_loss_mlp": 1.37109375, "step": 42, "time_per_iteration": 2.6113343238830566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01223311, "balance_loss_mlp": 1.09961998, "epoch": 0.008272412466333205, "flos": 518031811584.0, "grad_norm": 0.12261468754490484, "language_loss": 1.02989793, "learning_rate": 0.0007448133392900729, "loss": 1.04213095, "num_input_tokens_seen": 3254256, "router_z_loss_mlp": 1.23535156, "step": 43, "time_per_iteration": 2.6736834049224854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0123183, "balance_loss_mlp": 1.11490965, "epoch": 0.008464794151596768, "flos": 607974626304.0, "grad_norm": 0.06742287935331995, "language_loss": 0.98469728, "learning_rate": 0.0007493658489441491, "loss": 0.9970156, "num_input_tokens_seen": 3340224, "router_z_loss_mlp": 1.16699219, "step": 44, "time_per_iteration": 2.8660154342651367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01221739, "balance_loss_mlp": 1.11549973, "epoch": 0.00865717583686033, "flos": 537929372160.0, "grad_norm": 0.13165016268944502, "language_loss": 1.02125764, "learning_rate": 0.0007538160463002316, "loss": 1.03347504, "num_input_tokens_seen": 3409216, "router_z_loss_mlp": 1.06445312, "step": 45, "time_per_iteration": 2.647026777267456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01219104, "balance_loss_mlp": 1.12082767, "epoch": 0.008849557522123894, "flos": 508007780352.0, "grad_norm": 0.09154051415002856, "language_loss": 1.05303812, "learning_rate": 0.0007581684291577274, "loss": 1.06522906, "num_input_tokens_seen": 3478352, "router_z_loss_mlp": 0.98193359, "step": 46, "time_per_iteration": 2.5779762268066406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01211973, "balance_loss_mlp": 1.12180293, "epoch": 0.009041939207387457, "flos": 625339657728.0, "grad_norm": 0.10098348979088022, "language_loss": 1.08761919, "learning_rate": 0.0007624272050891776, "loss": 1.09973884, "num_input_tokens_seen": 3555616, "router_z_loss_mlp": 0.90185547, "step": 47, "time_per_iteration": 2.8511393070220947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178324, "balance_loss_mlp": 1.09893048, "epoch": 0.00923432089265102, "flos": 549421014528.0, "grad_norm": 0.06288361982709323, "language_loss": 0.98731792, "learning_rate": 0.0007665963158851307, "loss": 0.9991011, "num_input_tokens_seen": 3634512, "router_z_loss_mlp": 0.79345703, "step": 48, "time_per_iteration": 2.7975704669952393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117803, "balance_loss_mlp": 1.10588408, "epoch": 0.009426702577914583, "flos": 562496638464.0, "grad_norm": 0.07935638516568921, "language_loss": 1.07018328, "learning_rate": 0.0007706794594783609, "loss": 1.08196378, "num_input_tokens_seen": 3708480, "router_z_loss_mlp": 0.72167969, "step": 49, "time_per_iteration": 2.762869358062744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01170672, "balance_loss_mlp": 1.10281849, "epoch": 0.009619084263178146, "flos": 616773325824.0, "grad_norm": 0.06589219417940043, "language_loss": 1.06122911, "learning_rate": 0.0007746801096530423, "loss": 1.07293582, "num_input_tokens_seen": 3783472, "router_z_loss_mlp": 0.67919922, "step": 50, "time_per_iteration": 2.755232334136963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116692, "balance_loss_mlp": 1.10545588, "epoch": 0.009811465948441709, "flos": 541437986304.0, "grad_norm": 0.09337036144210262, "language_loss": 1.10751569, "learning_rate": 0.0007786015338021173, "loss": 1.11918497, "num_input_tokens_seen": 3851360, "router_z_loss_mlp": 0.61376953, "step": 51, "time_per_iteration": 2.6145899295806885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159208, "balance_loss_mlp": 1.10279799, "epoch": 0.010003847633705272, "flos": 535881028608.0, "grad_norm": 0.0700474736529942, "language_loss": 1.03127432, "learning_rate": 0.0007824468089603051, "loss": 1.04286635, "num_input_tokens_seen": 3923056, "router_z_loss_mlp": 0.56396484, "step": 52, "time_per_iteration": 2.653333902359009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162168, "balance_loss_mlp": 1.1128397, "epoch": 0.010196229318968833, "flos": 909254315520.0, "grad_norm": 0.0678828268350522, "language_loss": 1.02721131, "learning_rate": 0.0007862188363098669, "loss": 1.0388329, "num_input_tokens_seen": 4004528, "router_z_loss_mlp": 0.4934082, "step": 53, "time_per_iteration": 3.16854190826416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150565, "balance_loss_mlp": 1.10464573, "epoch": 0.010388611004232396, "flos": 585868308480.0, "grad_norm": 0.07226768628462193, "language_loss": 1.03151178, "learning_rate": 0.0007899203543304438, "loss": 1.04301751, "num_input_tokens_seen": 4078704, "router_z_loss_mlp": 0.45947266, "step": 54, "time_per_iteration": 2.684342384338379 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153237, "balance_loss_mlp": 1.10901022, "epoch": 0.01058099268949596, "flos": 502480558080.0, "grad_norm": 0.2877805661885644, "language_loss": 1.16480064, "learning_rate": 0.0007935539507422731, "loss": 1.17633295, "num_input_tokens_seen": 4143600, "router_z_loss_mlp": 0.44213867, "step": 55, "time_per_iteration": 2.550560235977173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135293, "balance_loss_mlp": 1.09545326, "epoch": 0.010773374374759523, "flos": 544447360512.0, "grad_norm": 0.09011321470942846, "language_loss": 1.08752644, "learning_rate": 0.0007971220733732573, "loss": 1.09887934, "num_input_tokens_seen": 4217904, "router_z_loss_mlp": 0.39819336, "step": 56, "time_per_iteration": 2.6777026653289795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138051, "balance_loss_mlp": 1.10307515, "epoch": 0.010965756060023086, "flos": 526155803136.0, "grad_norm": 0.08011479339587849, "language_loss": 1.04026377, "learning_rate": 0.0008006270400641869, "loss": 1.05164433, "num_input_tokens_seen": 4293920, "router_z_loss_mlp": 0.34985352, "step": 57, "time_per_iteration": 2.6899423599243164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140153, "balance_loss_mlp": 1.10787153, "epoch": 0.011158137745286649, "flos": 576941128704.0, "grad_norm": 0.11169369867739573, "language_loss": 1.05261517, "learning_rate": 0.0008040710477125043, "loss": 1.06401682, "num_input_tokens_seen": 4370080, "router_z_loss_mlp": 0.32275391, "step": 58, "time_per_iteration": 2.723038911819458 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144338, "balance_loss_mlp": 1.11403465, "epoch": 0.011350519430550212, "flos": 529281547776.0, "grad_norm": 0.15034464280850074, "language_loss": 1.06417704, "learning_rate": 0.0008074561805429771, "loss": 1.07562041, "num_input_tokens_seen": 4439792, "router_z_loss_mlp": 0.30297852, "step": 59, "time_per_iteration": 2.6378283500671387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136058, "balance_loss_mlp": 1.10842514, "epoch": 0.011542901115813775, "flos": 555879905280.0, "grad_norm": 0.12260992246729245, "language_loss": 1.03937411, "learning_rate": 0.0008107844176832545, "loss": 1.05073476, "num_input_tokens_seen": 4510800, "router_z_loss_mlp": 0.27612305, "step": 60, "time_per_iteration": 2.700141668319702 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143848, "balance_loss_mlp": 1.11745548, "epoch": 0.011735282801077338, "flos": 572095954944.0, "grad_norm": 0.07189127634205647, "language_loss": 1.05365705, "learning_rate": 0.0008140576401132568, "loss": 1.06509542, "num_input_tokens_seen": 4581136, "router_z_loss_mlp": 0.2644043, "step": 61, "time_per_iteration": 2.6508264541625977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141309, "balance_loss_mlp": 1.11781311, "epoch": 0.0119276644863409, "flos": 615589839360.0, "grad_norm": 0.05216073972873087, "language_loss": 1.06422329, "learning_rate": 0.0008172776370494935, "loss": 1.07563639, "num_input_tokens_seen": 4650352, "router_z_loss_mlp": 0.23461914, "step": 62, "time_per_iteration": 2.725492238998413 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136117, "balance_loss_mlp": 1.11272764, "epoch": 0.012120046171604464, "flos": 501084527616.0, "grad_norm": 0.101779425959611, "language_loss": 1.13612652, "learning_rate": 0.0008204461118185703, "loss": 1.14748764, "num_input_tokens_seen": 4716336, "router_z_loss_mlp": 0.23376465, "step": 63, "time_per_iteration": 2.5753746032714844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148279, "balance_loss_mlp": 1.12627339, "epoch": 0.012312427856868027, "flos": 473347100160.0, "grad_norm": 0.07447427381713748, "language_loss": 1.0324012, "learning_rate": 0.0008235646872681536, "loss": 1.04388404, "num_input_tokens_seen": 4781648, "router_z_loss_mlp": 0.22009277, "step": 64, "time_per_iteration": 2.5766890048980713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134709, "balance_loss_mlp": 1.11331069, "epoch": 0.012504809542131588, "flos": 538369141248.0, "grad_norm": 0.38827595406324295, "language_loss": 1.02755439, "learning_rate": 0.0008266349107584288, "loss": 1.03890157, "num_input_tokens_seen": 4852320, "router_z_loss_mlp": 0.2142334, "step": 65, "time_per_iteration": 2.6795432567596436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150765, "balance_loss_mlp": 1.12982011, "epoch": 0.012697191227395151, "flos": 608730826752.0, "grad_norm": 0.12495940986475743, "language_loss": 1.06208372, "learning_rate": 0.0008296582587724851, "loss": 1.07359147, "num_input_tokens_seen": 4922016, "router_z_loss_mlp": 0.20947266, "step": 66, "time_per_iteration": 2.7176458835601807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140545, "balance_loss_mlp": 1.11969519, "epoch": 0.012889572912658714, "flos": 768079065600.0, "grad_norm": 0.1040817091496257, "language_loss": 1.04495656, "learning_rate": 0.0008326361411800136, "loss": 1.05636215, "num_input_tokens_seen": 5000128, "router_z_loss_mlp": 0.20861816, "step": 67, "time_per_iteration": 2.944484233856201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136624, "balance_loss_mlp": 1.11664486, "epoch": 0.013081954597922277, "flos": 533887013376.0, "grad_norm": 0.1236975736999165, "language_loss": 1.04613113, "learning_rate": 0.0008355699051851403, "loss": 1.05749726, "num_input_tokens_seen": 5074512, "router_z_loss_mlp": 0.1998291, "step": 68, "time_per_iteration": 2.7155401706695557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163949, "balance_loss_mlp": 1.14371967, "epoch": 0.01327433628318584, "flos": 573096632832.0, "grad_norm": 0.08669769947970225, "language_loss": 1.11325383, "learning_rate": 0.0008384608389860635, "loss": 1.12489343, "num_input_tokens_seen": 5141856, "router_z_loss_mlp": 0.20214844, "step": 69, "time_per_iteration": 2.6746206283569336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01170727, "balance_loss_mlp": 1.15127182, "epoch": 0.013466717968449404, "flos": 497274536448.0, "grad_norm": 0.13494585106435908, "language_loss": 1.01927853, "learning_rate": 0.000841310175171381, "loss": 1.03098571, "num_input_tokens_seen": 5209280, "router_z_loss_mlp": 0.19433594, "step": 70, "time_per_iteration": 2.6096978187561035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116458, "balance_loss_mlp": 1.14537501, "epoch": 0.013659099653712967, "flos": 565511155200.0, "grad_norm": 0.08071853308807045, "language_loss": 0.99831259, "learning_rate": 0.000844119093875517, "loss": 1.00995839, "num_input_tokens_seen": 5285424, "router_z_loss_mlp": 0.19189453, "step": 71, "time_per_iteration": 2.7110228538513184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172694, "balance_loss_mlp": 1.1531322, "epoch": 0.01385148133897653, "flos": 573820526592.0, "grad_norm": 0.1298896621631551, "language_loss": 1.05077183, "learning_rate": 0.0008468887257134666, "loss": 1.06249881, "num_input_tokens_seen": 5358624, "router_z_loss_mlp": 0.19543457, "step": 72, "time_per_iteration": 2.6877832412719727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117331, "balance_loss_mlp": 1.15338969, "epoch": 0.014043863024240093, "flos": 576822560256.0, "grad_norm": 0.15655470084299106, "language_loss": 1.07319438, "learning_rate": 0.0008496201545131264, "loss": 1.08492744, "num_input_tokens_seen": 5429792, "router_z_loss_mlp": 0.19909668, "step": 73, "time_per_iteration": 2.712404251098633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155518, "balance_loss_mlp": 1.13590837, "epoch": 0.014236244709503656, "flos": 938681809920.0, "grad_norm": 0.16190508579873739, "language_loss": 1.04767108, "learning_rate": 0.0008523144198617317, "loss": 1.05922627, "num_input_tokens_seen": 5518608, "router_z_loss_mlp": 0.19604492, "step": 74, "time_per_iteration": 3.1923534870147705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136399, "balance_loss_mlp": 1.11624122, "epoch": 0.014428626394767219, "flos": 528483502080.0, "grad_norm": 0.09478832041488004, "language_loss": 1.04861999, "learning_rate": 0.0008549725194813783, "loss": 1.05998397, "num_input_tokens_seen": 5590576, "router_z_loss_mlp": 0.20153809, "step": 75, "time_per_iteration": 2.6708076000213623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116508, "balance_loss_mlp": 1.09800684, "epoch": 0.014621008080030782, "flos": 803752533504.0, "grad_norm": 0.08770819878028477, "language_loss": 1.03907192, "learning_rate": 0.0008575954114472099, "loss": 1.05023694, "num_input_tokens_seen": 5674224, "router_z_loss_mlp": 0.18481445, "step": 76, "time_per_iteration": 3.13152813911438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115762, "balance_loss_mlp": 1.09717751, "epoch": 0.014813389765294343, "flos": 696941356032.0, "grad_norm": 0.13848190952411177, "language_loss": 1.01474786, "learning_rate": 0.0008601840162606118, "loss": 1.02590549, "num_input_tokens_seen": 5757648, "router_z_loss_mlp": 0.18591309, "step": 77, "time_per_iteration": 3.0026464462280273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126663, "balance_loss_mlp": 1.10745883, "epoch": 0.015005771450557906, "flos": 596994333696.0, "grad_norm": 0.04300320251384177, "language_loss": 1.07548404, "learning_rate": 0.000862739218788641, "loss": 1.08675063, "num_input_tokens_seen": 5837600, "router_z_loss_mlp": 0.19189453, "step": 78, "time_per_iteration": 2.780151128768921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136666, "balance_loss_mlp": 1.11736631, "epoch": 0.01519815313582147, "flos": 549416245248.0, "grad_norm": 0.05300805683051922, "language_loss": 1.05217659, "learning_rate": 0.0008652618700799138, "loss": 1.0635432, "num_input_tokens_seen": 5907248, "router_z_loss_mlp": 0.19287109, "step": 79, "time_per_iteration": 2.644989252090454 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115046, "balance_loss_mlp": 1.13105261, "epoch": 0.015390534821085032, "flos": 430532692992.0, "grad_norm": 0.13679514692214284, "language_loss": 1.04483461, "learning_rate": 0.0008677527890662774, "loss": 1.05633926, "num_input_tokens_seen": 5970864, "router_z_loss_mlp": 0.19384766, "step": 80, "time_per_iteration": 2.4652533531188965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151484, "balance_loss_mlp": 1.13120639, "epoch": 0.015582916506348595, "flos": 524119942656.0, "grad_norm": 0.06949005945359786, "language_loss": 1.05593443, "learning_rate": 0.0008702127641587799, "loss": 1.06744933, "num_input_tokens_seen": 6040800, "router_z_loss_mlp": 0.20263672, "step": 81, "time_per_iteration": 2.6423192024230957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155894, "balance_loss_mlp": 1.13492513, "epoch": 0.015775298191612157, "flos": 575443782144.0, "grad_norm": 0.09507058081046676, "language_loss": 1.01514888, "learning_rate": 0.0008726425547457192, "loss": 1.02670789, "num_input_tokens_seen": 6111840, "router_z_loss_mlp": 0.20959473, "step": 82, "time_per_iteration": 2.7670798301696777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133891, "balance_loss_mlp": 1.11376882, "epoch": 0.01596767987687572, "flos": 610319577600.0, "grad_norm": 0.0793725108169458, "language_loss": 1.00304663, "learning_rate": 0.0008750428925998964, "loss": 1.01438546, "num_input_tokens_seen": 6183872, "router_z_loss_mlp": 0.20117188, "step": 83, "time_per_iteration": 2.7451062202453613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145272, "balance_loss_mlp": 1.12516141, "epoch": 0.016160061562139283, "flos": 567136982016.0, "grad_norm": 0.14534943996774727, "language_loss": 1.06251049, "learning_rate": 0.0008774144832015932, "loss": 1.07396317, "num_input_tokens_seen": 6255760, "router_z_loss_mlp": 0.2010498, "step": 84, "time_per_iteration": 2.7039954662323 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01784137, "balance_loss_mlp": 1.77116704, "epoch": 0.016352443247402846, "flos": 1411343543808.0, "grad_norm": 0.33978769388161495, "language_loss": 0.74774313, "learning_rate": 0.0008797580069832641, "loss": 0.76558447, "num_input_tokens_seen": 6472960, "router_z_loss_mlp": 0.12988281, "step": 85, "time_per_iteration": 4.672428846359253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133263, "balance_loss_mlp": 1.11339045, "epoch": 0.01654482493266641, "flos": 730497844224.0, "grad_norm": 0.0814354491433929, "language_loss": 1.01647198, "learning_rate": 0.0008820741205014318, "loss": 1.02780461, "num_input_tokens_seen": 6548912, "router_z_loss_mlp": 0.19873047, "step": 86, "time_per_iteration": 2.9217472076416016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135249, "balance_loss_mlp": 1.11522174, "epoch": 0.016737206617929972, "flos": 536293633536.0, "grad_norm": 0.09136661427056217, "language_loss": 1.02933669, "learning_rate": 0.0008843634575408404, "loss": 1.04068923, "num_input_tokens_seen": 6621520, "router_z_loss_mlp": 0.20031738, "step": 87, "time_per_iteration": 2.7795376777648926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126576, "balance_loss_mlp": 1.10805094, "epoch": 0.016929588303193535, "flos": 536990363136.0, "grad_norm": 0.08653972064742017, "language_loss": 1.04609084, "learning_rate": 0.0008866266301555082, "loss": 1.0573566, "num_input_tokens_seen": 6698432, "router_z_loss_mlp": 0.18518066, "step": 88, "time_per_iteration": 2.7490010261535645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144591, "balance_loss_mlp": 1.12630451, "epoch": 0.017121969988457098, "flos": 526756359168.0, "grad_norm": 0.0643644920813647, "language_loss": 1.05052233, "learning_rate": 0.0008888642296509615, "loss": 1.06196821, "num_input_tokens_seen": 6764336, "router_z_loss_mlp": 0.18273926, "step": 89, "time_per_iteration": 2.594862222671509 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167485, "balance_loss_mlp": 1.14840007, "epoch": 0.01731435167372066, "flos": 625596618240.0, "grad_norm": 0.0960094219381758, "language_loss": 1.09507632, "learning_rate": 0.0008910768275115906, "loss": 1.10675108, "num_input_tokens_seen": 6839392, "router_z_loss_mlp": 0.1907959, "step": 90, "time_per_iteration": 2.732243299484253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168948, "balance_loss_mlp": 1.14970791, "epoch": 0.017506733358984224, "flos": 496402338816.0, "grad_norm": 0.08670111946866453, "language_loss": 1.05579484, "learning_rate": 0.0008932649762767675, "loss": 1.06748414, "num_input_tokens_seen": 6907344, "router_z_loss_mlp": 0.19238281, "step": 91, "time_per_iteration": 2.58011531829834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156938, "balance_loss_mlp": 1.13799536, "epoch": 0.017699115044247787, "flos": 745933100544.0, "grad_norm": 0.1377326340865385, "language_loss": 1.07988524, "learning_rate": 0.0008954292103690864, "loss": 1.09145451, "num_input_tokens_seen": 6982464, "router_z_loss_mlp": 0.18933105, "step": 92, "time_per_iteration": 2.88777494430542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144865, "balance_loss_mlp": 1.12581539, "epoch": 0.01789149672951135, "flos": 515509194240.0, "grad_norm": 0.08013614344713903, "language_loss": 1.10040021, "learning_rate": 0.0008975700468778296, "loss": 1.11184883, "num_input_tokens_seen": 7049712, "router_z_loss_mlp": 0.19042969, "step": 93, "time_per_iteration": 2.5774590969085693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153192, "balance_loss_mlp": 1.13429725, "epoch": 0.018083878414774913, "flos": 586125268992.0, "grad_norm": 0.08120240816831911, "language_loss": 1.03244281, "learning_rate": 0.0008996879863005366, "loss": 1.04397476, "num_input_tokens_seen": 7120288, "router_z_loss_mlp": 0.18896484, "step": 94, "time_per_iteration": 2.6684646606445312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166139, "balance_loss_mlp": 1.14685082, "epoch": 0.018276260100038477, "flos": 497356028928.0, "grad_norm": 0.10696755240582503, "language_loss": 1.0365541, "learning_rate": 0.0009017835132453337, "loss": 1.04821539, "num_input_tokens_seen": 7188896, "router_z_loss_mlp": 0.19262695, "step": 95, "time_per_iteration": 2.5731871128082275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160643, "balance_loss_mlp": 1.14130712, "epoch": 0.01846864178530204, "flos": 640058360832.0, "grad_norm": 0.09689172385373614, "language_loss": 1.03809953, "learning_rate": 0.0009038570970964896, "loss": 1.04970598, "num_input_tokens_seen": 7259536, "router_z_loss_mlp": 0.1932373, "step": 96, "time_per_iteration": 2.7642133235931396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142174, "balance_loss_mlp": 1.1226114, "epoch": 0.018661023470565603, "flos": 511662127104.0, "grad_norm": 0.0731237284630876, "language_loss": 1.01012015, "learning_rate": 0.0009059091926454854, "loss": 1.02154183, "num_input_tokens_seen": 7326752, "router_z_loss_mlp": 0.19543457, "step": 97, "time_per_iteration": 2.5798768997192383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134737, "balance_loss_mlp": 1.11522222, "epoch": 0.018853405155829166, "flos": 931106244096.0, "grad_norm": 0.09616120207899966, "language_loss": 1.00179553, "learning_rate": 0.0009079402406897198, "loss": 1.01314282, "num_input_tokens_seen": 7417488, "router_z_loss_mlp": 0.19494629, "step": 98, "time_per_iteration": 3.2566075325012207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143337, "balance_loss_mlp": 1.12357211, "epoch": 0.01904578684109273, "flos": 576484107264.0, "grad_norm": 0.06455780129345397, "language_loss": 1.01265812, "learning_rate": 0.0009099506686008212, "loss": 1.02409148, "num_input_tokens_seen": 7493136, "router_z_loss_mlp": 0.19763184, "step": 99, "time_per_iteration": 2.799565553665161 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129571, "balance_loss_mlp": 1.11054564, "epoch": 0.019238168526356292, "flos": 558442169856.0, "grad_norm": 0.10657448879387016, "language_loss": 1.0467732, "learning_rate": 0.0009119408908644013, "loss": 1.05806899, "num_input_tokens_seen": 7560896, "router_z_loss_mlp": 0.19030762, "step": 100, "time_per_iteration": 2.684875249862671 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122425, "balance_loss_mlp": 1.10363734, "epoch": 0.019430550211619855, "flos": 723851375616.0, "grad_norm": 0.06970738765852934, "language_loss": 1.09725833, "learning_rate": 0.0009139113095929519, "loss": 1.1084826, "num_input_tokens_seen": 7629040, "router_z_loss_mlp": 0.18762207, "step": 101, "time_per_iteration": 2.8530783653259277 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130466, "balance_loss_mlp": 1.11095107, "epoch": 0.019622931896883418, "flos": 499478524416.0, "grad_norm": 0.04951217111237057, "language_loss": 1.03750157, "learning_rate": 0.0009158623150134762, "loss": 1.04880619, "num_input_tokens_seen": 7694256, "router_z_loss_mlp": 0.19506836, "step": 102, "time_per_iteration": 2.5738718509674072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124372, "balance_loss_mlp": 1.10552466, "epoch": 0.01981531358214698, "flos": 509188695552.0, "grad_norm": 0.07829016079597523, "language_loss": 1.03829539, "learning_rate": 0.000917794285931332, "loss": 1.04953909, "num_input_tokens_seen": 7762256, "router_z_loss_mlp": 0.18859863, "step": 103, "time_per_iteration": 2.6672050952911377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116034, "balance_loss_mlp": 1.09756863, "epoch": 0.020007695267410544, "flos": 521347705344.0, "grad_norm": 0.06055754000551873, "language_loss": 0.96430528, "learning_rate": 0.0009197075901716639, "loss": 0.97546566, "num_input_tokens_seen": 7834400, "router_z_loss_mlp": 0.18444824, "step": 104, "time_per_iteration": 2.7030909061431885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143399, "balance_loss_mlp": 1.12458754, "epoch": 0.020200076952674107, "flos": 533298940416.0, "grad_norm": 0.08851166873462187, "language_loss": 1.06492853, "learning_rate": 0.0009216025849997171, "loss": 1.07636249, "num_input_tokens_seen": 7911184, "router_z_loss_mlp": 0.18798828, "step": 105, "time_per_iteration": 2.770717144012451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136641, "balance_loss_mlp": 1.11799645, "epoch": 0.020392458637937667, "flos": 684760324608.0, "grad_norm": 0.1087806769155691, "language_loss": 1.01426148, "learning_rate": 0.0009234796175212258, "loss": 1.02562797, "num_input_tokens_seen": 7985280, "router_z_loss_mlp": 0.18640137, "step": 106, "time_per_iteration": 2.9345030784606934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145469, "balance_loss_mlp": 1.12691963, "epoch": 0.02058484032320123, "flos": 702115444224.0, "grad_norm": 0.08314221817588373, "language_loss": 1.04264343, "learning_rate": 0.000925339025064007, "loss": 1.05409813, "num_input_tokens_seen": 8068320, "router_z_loss_mlp": 0.18530273, "step": 107, "time_per_iteration": 2.9724230766296387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136133, "balance_loss_mlp": 1.11766744, "epoch": 0.020777222008464793, "flos": 639082275840.0, "grad_norm": 0.06103111074840472, "language_loss": 0.9746207, "learning_rate": 0.0009271811355418027, "loss": 0.98598194, "num_input_tokens_seen": 8148144, "router_z_loss_mlp": 0.18457031, "step": 108, "time_per_iteration": 2.8312766551971436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114122, "balance_loss_mlp": 1.12251627, "epoch": 0.020969603693728356, "flos": 682091974656.0, "grad_norm": 0.09366723049874563, "language_loss": 1.0430491, "learning_rate": 0.0009290062678013548, "loss": 1.05446124, "num_input_tokens_seen": 8222256, "router_z_loss_mlp": 0.18713379, "step": 109, "time_per_iteration": 2.8890299797058105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119997, "balance_loss_mlp": 1.10091138, "epoch": 0.02116198537899192, "flos": 533395487232.0, "grad_norm": 0.07845117671788823, "language_loss": 1.02498507, "learning_rate": 0.0009308147319536321, "loss": 1.03618503, "num_input_tokens_seen": 8292432, "router_z_loss_mlp": 0.19067383, "step": 110, "time_per_iteration": 2.6301145553588867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124178, "balance_loss_mlp": 1.10517561, "epoch": 0.021354367064255482, "flos": 717479119872.0, "grad_norm": 0.06169483511964636, "language_loss": 1.08628201, "learning_rate": 0.0009326068296900676, "loss": 1.09752393, "num_input_tokens_seen": 8365024, "router_z_loss_mlp": 0.18981934, "step": 111, "time_per_iteration": 2.8480148315429688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124091, "balance_loss_mlp": 1.1046958, "epoch": 0.021546748749519045, "flos": 519556322304.0, "grad_norm": 0.07277353768082521, "language_loss": 1.00328588, "learning_rate": 0.0009343828545846161, "loss": 1.01452684, "num_input_tokens_seen": 8442448, "router_z_loss_mlp": 0.19384766, "step": 112, "time_per_iteration": 2.785245656967163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145687, "balance_loss_mlp": 1.12596965, "epoch": 0.021739130434782608, "flos": 505161391104.0, "grad_norm": 0.0989159829516975, "language_loss": 1.03963184, "learning_rate": 0.0009361430923823841, "loss": 1.05108869, "num_input_tokens_seen": 8508992, "router_z_loss_mlp": 0.19702148, "step": 113, "time_per_iteration": 2.6218817234039307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139838, "balance_loss_mlp": 1.11994159, "epoch": 0.02193151212004617, "flos": 463486053888.0, "grad_norm": 0.08134488401387123, "language_loss": 1.07289195, "learning_rate": 0.0009378878212755459, "loss": 1.08429039, "num_input_tokens_seen": 8574048, "router_z_loss_mlp": 0.19885254, "step": 114, "time_per_iteration": 2.489394426345825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135841, "balance_loss_mlp": 1.11546779, "epoch": 0.022123893805309734, "flos": 552272546304.0, "grad_norm": 0.08931795851274972, "language_loss": 0.98084462, "learning_rate": 0.0009396173121672103, "loss": 0.992203, "num_input_tokens_seen": 8647808, "router_z_loss_mlp": 0.20373535, "step": 115, "time_per_iteration": 2.6338186264038086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132046, "balance_loss_mlp": 1.11229324, "epoch": 0.022316275490573297, "flos": 636211293696.0, "grad_norm": 0.07784948028132394, "language_loss": 1.03230667, "learning_rate": 0.0009413318289238633, "loss": 1.04362714, "num_input_tokens_seen": 8719760, "router_z_loss_mlp": 0.1973877, "step": 116, "time_per_iteration": 2.7797064781188965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119746, "balance_loss_mlp": 1.10049319, "epoch": 0.02250865717583686, "flos": 798890107392.0, "grad_norm": 0.10235619274826367, "language_loss": 0.95674431, "learning_rate": 0.0009430316286169771, "loss": 0.96794176, "num_input_tokens_seen": 8798752, "router_z_loss_mlp": 0.19226074, "step": 117, "time_per_iteration": 3.0148251056671143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123727, "balance_loss_mlp": 1.10400951, "epoch": 0.022701038861100423, "flos": 456093296640.0, "grad_norm": 0.08556933686221588, "language_loss": 1.00759292, "learning_rate": 0.0009447169617543361, "loss": 1.0188303, "num_input_tokens_seen": 8866848, "router_z_loss_mlp": 0.19714355, "step": 118, "time_per_iteration": 2.570577383041382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147362, "balance_loss_mlp": 1.12738276, "epoch": 0.022893420546363986, "flos": 583086159360.0, "grad_norm": 0.14195532580527156, "language_loss": 1.07468402, "learning_rate": 0.0009463880725016029, "loss": 1.08615768, "num_input_tokens_seen": 8935488, "router_z_loss_mlp": 0.19970703, "step": 119, "time_per_iteration": 2.687791585922241 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119491, "balance_loss_mlp": 1.1002152, "epoch": 0.02308580223162755, "flos": 561303240192.0, "grad_norm": 0.12580227983012474, "language_loss": 1.02723956, "learning_rate": 0.0009480451988946134, "loss": 1.03843451, "num_input_tokens_seen": 9015344, "router_z_loss_mlp": 0.19274902, "step": 120, "time_per_iteration": 2.86080002784729 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118723, "balance_loss_mlp": 1.09974504, "epoch": 0.023278183916891113, "flos": 771300983808.0, "grad_norm": 0.09779732210141849, "language_loss": 1.04102588, "learning_rate": 0.0009496885730428627, "loss": 1.05221319, "num_input_tokens_seen": 9094672, "router_z_loss_mlp": 0.1895752, "step": 121, "time_per_iteration": 3.058720350265503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129608, "balance_loss_mlp": 1.11076128, "epoch": 0.023470565602154676, "flos": 553374540288.0, "grad_norm": 0.21300696817673925, "language_loss": 1.02294064, "learning_rate": 0.0009513184213246156, "loss": 1.03423667, "num_input_tokens_seen": 9160608, "router_z_loss_mlp": 0.18859863, "step": 122, "time_per_iteration": 2.634585380554199 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112851, "balance_loss_mlp": 1.10879278, "epoch": 0.02366294728741824, "flos": 560028349440.0, "grad_norm": 0.08876505507315528, "language_loss": 1.05331969, "learning_rate": 0.0009529349645740552, "loss": 1.06460488, "num_input_tokens_seen": 9228704, "router_z_loss_mlp": 0.19702148, "step": 123, "time_per_iteration": 2.68062686920166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139736, "balance_loss_mlp": 1.11948287, "epoch": 0.0238553289726818, "flos": 468553683456.0, "grad_norm": 0.07165211399576038, "language_loss": 1.04294729, "learning_rate": 0.0009545384182608524, "loss": 1.05434453, "num_input_tokens_seen": 9294288, "router_z_loss_mlp": 0.20239258, "step": 124, "time_per_iteration": 2.541867971420288 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147945, "balance_loss_mlp": 1.12758446, "epoch": 0.024047710657945365, "flos": 560030920704.0, "grad_norm": 0.1170262954091428, "language_loss": 1.01733518, "learning_rate": 0.0009561289926625252, "loss": 1.02881455, "num_input_tokens_seen": 9368048, "router_z_loss_mlp": 0.20361328, "step": 125, "time_per_iteration": 2.6904866695404053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144311, "balance_loss_mlp": 1.12337756, "epoch": 0.024240092343208928, "flos": 504775950336.0, "grad_norm": 0.0767802787123007, "language_loss": 1.06512678, "learning_rate": 0.0009577068930299292, "loss": 1.07656991, "num_input_tokens_seen": 9434848, "router_z_loss_mlp": 0.20935059, "step": 126, "time_per_iteration": 2.5956666469573975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112026, "balance_loss_mlp": 1.10011339, "epoch": 0.02443247402847249, "flos": 435763307520.0, "grad_norm": 0.05578094289714296, "language_loss": 1.01563096, "learning_rate": 0.0009592723197462087, "loss": 1.02683353, "num_input_tokens_seen": 9504112, "router_z_loss_mlp": 0.20141602, "step": 127, "time_per_iteration": 2.652282953262329 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135813, "balance_loss_mlp": 1.11633444, "epoch": 0.024624855713736054, "flos": 683769558528.0, "grad_norm": 0.08941911012616197, "language_loss": 0.98464531, "learning_rate": 0.0009608254684795125, "loss": 0.99600339, "num_input_tokens_seen": 9590032, "router_z_loss_mlp": 0.19470215, "step": 128, "time_per_iteration": 2.9219348430633545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113214, "balance_loss_mlp": 1.11204123, "epoch": 0.024817237398999614, "flos": 524999480832.0, "grad_norm": 0.07851670709976168, "language_loss": 1.01339173, "learning_rate": 0.0009623665303297678, "loss": 1.02471328, "num_input_tokens_seen": 9663040, "router_z_loss_mlp": 0.20092773, "step": 129, "time_per_iteration": 2.72129225730896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138949, "balance_loss_mlp": 1.11936343, "epoch": 0.025009619084263177, "flos": 655656602112.0, "grad_norm": 0.10234054898828188, "language_loss": 1.05215728, "learning_rate": 0.0009638956919697878, "loss": 1.0635469, "num_input_tokens_seen": 9736544, "router_z_loss_mlp": 0.19580078, "step": 130, "time_per_iteration": 2.8943347930908203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120076, "balance_loss_mlp": 1.10040641, "epoch": 0.02520200076952674, "flos": 454423053312.0, "grad_norm": 0.07955649128739337, "language_loss": 0.97532988, "learning_rate": 0.0009654131357809714, "loss": 0.98653066, "num_input_tokens_seen": 9804656, "router_z_loss_mlp": 0.19665527, "step": 131, "time_per_iteration": 2.5710790157318115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131377, "balance_loss_mlp": 1.11108756, "epoch": 0.025394382454790303, "flos": 839794563072.0, "grad_norm": 0.09603534709419483, "language_loss": 1.06830871, "learning_rate": 0.0009669190399838441, "loss": 1.07962251, "num_input_tokens_seen": 9888864, "router_z_loss_mlp": 0.20275879, "step": 132, "time_per_iteration": 3.12355899810791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104613, "balance_loss_mlp": 1.08422863, "epoch": 0.025586764140053866, "flos": 581025332736.0, "grad_norm": 0.07678679730921736, "language_loss": 0.99635059, "learning_rate": 0.0009684135787636724, "loss": 1.0073967, "num_input_tokens_seen": 9968208, "router_z_loss_mlp": 0.20373535, "step": 133, "time_per_iteration": 2.8190038204193115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011225, "balance_loss_mlp": 1.10198379, "epoch": 0.02577914582531743, "flos": 790249623552.0, "grad_norm": 0.06194161941979751, "language_loss": 1.03999257, "learning_rate": 0.0009698969223913726, "loss": 1.05121756, "num_input_tokens_seen": 10049664, "router_z_loss_mlp": 0.2052002, "step": 134, "time_per_iteration": 3.0173001289367676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111055, "balance_loss_mlp": 1.09066617, "epoch": 0.025971527510580992, "flos": 594958473216.0, "grad_norm": 0.06876216863310104, "language_loss": 1.06792855, "learning_rate": 0.0009713692373399265, "loss": 1.07903397, "num_input_tokens_seen": 10120096, "router_z_loss_mlp": 0.19873047, "step": 135, "time_per_iteration": 2.670929431915283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0134721, "balance_loss_mlp": 1.33280921, "epoch": 0.026163909195844555, "flos": 1577629716480.0, "grad_norm": 0.15411027982306336, "language_loss": 0.79456228, "learning_rate": 0.0009728306863964993, "loss": 0.80803436, "num_input_tokens_seen": 10348976, "router_z_loss_mlp": 0.14355469, "step": 136, "time_per_iteration": 5.4502341747283936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142923, "balance_loss_mlp": 1.13023889, "epoch": 0.026356290881108118, "flos": 1502074865664.0, "grad_norm": 0.0420308652143082, "language_loss": 0.77811038, "learning_rate": 0.0009742814287704512, "loss": 0.78953964, "num_input_tokens_seen": 10576512, "router_z_loss_mlp": 0.12695312, "step": 137, "time_per_iteration": 4.911421298980713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140624, "balance_loss_mlp": 1.1204778, "epoch": 0.02654867256637168, "flos": 597140066304.0, "grad_norm": 0.15008184892874737, "language_loss": 0.99414909, "learning_rate": 0.0009757216201974225, "loss": 1.00555539, "num_input_tokens_seen": 10659168, "router_z_loss_mlp": 0.20141602, "step": 138, "time_per_iteration": 2.805294990539551 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163113, "balance_loss_mlp": 1.1417979, "epoch": 0.026741054251635244, "flos": 545035433472.0, "grad_norm": 0.10042691837700132, "language_loss": 1.04683781, "learning_rate": 0.0009771514130396581, "loss": 1.05846894, "num_input_tokens_seen": 10731584, "router_z_loss_mlp": 0.21325684, "step": 139, "time_per_iteration": 2.6785237789154053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171786, "balance_loss_mlp": 1.15150893, "epoch": 0.026933435936898807, "flos": 506841546240.0, "grad_norm": 0.13712828131438198, "language_loss": 1.04777944, "learning_rate": 0.00097857095638274, "loss": 1.05949712, "num_input_tokens_seen": 10799456, "router_z_loss_mlp": 0.20275879, "step": 140, "time_per_iteration": 2.5689632892608643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161751, "balance_loss_mlp": 1.140818, "epoch": 0.02712581762216237, "flos": 740860328448.0, "grad_norm": 0.04776427930188189, "language_loss": 0.96152979, "learning_rate": 0.0009799803961288726, "loss": 0.97314727, "num_input_tokens_seen": 10886416, "router_z_loss_mlp": 0.20922852, "step": 141, "time_per_iteration": 3.005524158477783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114311, "balance_loss_mlp": 1.12280869, "epoch": 0.027318199307425933, "flos": 848373378048.0, "grad_norm": 0.08242063446041879, "language_loss": 1.02058709, "learning_rate": 0.000981379875086876, "loss": 1.03201818, "num_input_tokens_seen": 10966064, "router_z_loss_mlp": 0.20300293, "step": 142, "time_per_iteration": 3.0404272079467773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149559, "balance_loss_mlp": 1.12884021, "epoch": 0.027510580992689496, "flos": 575557581312.0, "grad_norm": 0.08811908081945614, "language_loss": 0.97007114, "learning_rate": 0.0009827695330590185, "loss": 0.98156673, "num_input_tokens_seen": 11039712, "router_z_loss_mlp": 0.20727539, "step": 143, "time_per_iteration": 2.677872896194458 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139202, "balance_loss_mlp": 1.11838782, "epoch": 0.02770296267795306, "flos": 772420230144.0, "grad_norm": 0.09095558281985278, "language_loss": 0.9660008, "learning_rate": 0.0009841495069248256, "loss": 0.97739279, "num_input_tokens_seen": 11123984, "router_z_loss_mlp": 0.20788574, "step": 144, "time_per_iteration": 3.0181970596313477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124304, "balance_loss_mlp": 1.10402668, "epoch": 0.027895344363216622, "flos": 569387957760.0, "grad_norm": 0.06968867614461936, "language_loss": 0.96011639, "learning_rate": 0.0009855199307219871, "loss": 0.97135949, "num_input_tokens_seen": 11192864, "router_z_loss_mlp": 0.20275879, "step": 145, "time_per_iteration": 2.6638803482055664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129462, "balance_loss_mlp": 1.10819507, "epoch": 0.028087726048480186, "flos": 547360561152.0, "grad_norm": 0.10380696742567494, "language_loss": 0.97768301, "learning_rate": 0.0009868809357244854, "loss": 0.98897767, "num_input_tokens_seen": 11261760, "router_z_loss_mlp": 0.21264648, "step": 146, "time_per_iteration": 2.6609416007995605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108626, "balance_loss_mlp": 1.08754969, "epoch": 0.02828010773374375, "flos": 524789508096.0, "grad_norm": 0.04767435219925792, "language_loss": 1.01976728, "learning_rate": 0.0009882326505180556, "loss": 1.03085351, "num_input_tokens_seen": 11334736, "router_z_loss_mlp": 0.21081543, "step": 147, "time_per_iteration": 2.7018306255340576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116059, "balance_loss_mlp": 1.09487534, "epoch": 0.02847248941900731, "flos": 772440053760.0, "grad_norm": 0.081387986355653, "language_loss": 1.0020777, "learning_rate": 0.0009895752010730906, "loss": 1.01323831, "num_input_tokens_seen": 11409872, "router_z_loss_mlp": 0.21191406, "step": 148, "time_per_iteration": 2.9776458740234375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114545, "balance_loss_mlp": 1.09280121, "epoch": 0.028664871104270875, "flos": 534413417472.0, "grad_norm": 0.07164111136345892, "language_loss": 1.06547272, "learning_rate": 0.0009909087108150867, "loss": 1.07661819, "num_input_tokens_seen": 11481024, "router_z_loss_mlp": 0.21740723, "step": 149, "time_per_iteration": 2.7685787677764893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120998, "balance_loss_mlp": 1.09932601, "epoch": 0.028857252789534438, "flos": 367766396928.0, "grad_norm": 0.09002123643314056, "language_loss": 1.07463562, "learning_rate": 0.0009922333006927371, "loss": 1.08584571, "num_input_tokens_seen": 11544240, "router_z_loss_mlp": 0.2166748, "step": 150, "time_per_iteration": 2.5377442836761475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134752, "balance_loss_mlp": 1.11268604, "epoch": 0.029049634474798, "flos": 515482030080.0, "grad_norm": 0.07882603128859848, "language_loss": 1.00827551, "learning_rate": 0.0009935490892437632, "loss": 1.01962304, "num_input_tokens_seen": 11610416, "router_z_loss_mlp": 0.22070312, "step": 151, "time_per_iteration": 2.5629055500030518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126049, "balance_loss_mlp": 1.10497248, "epoch": 0.029242016160061564, "flos": 588141305856.0, "grad_norm": 0.07540534084758796, "language_loss": 0.99210167, "learning_rate": 0.0009948561926585687, "loss": 1.00336218, "num_input_tokens_seen": 11687488, "router_z_loss_mlp": 0.21069336, "step": 152, "time_per_iteration": 2.755824565887451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133957, "balance_loss_mlp": 1.1110214, "epoch": 0.029434397845325123, "flos": 552079825920.0, "grad_norm": 0.09890448438657973, "language_loss": 1.02627087, "learning_rate": 0.0009961547248418122, "loss": 1.03761053, "num_input_tokens_seen": 11754576, "router_z_loss_mlp": 0.22937012, "step": 153, "time_per_iteration": 2.6255645751953125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115876, "balance_loss_mlp": 1.09208155, "epoch": 0.029626779530588686, "flos": 603497640960.0, "grad_norm": 0.0750271830701194, "language_loss": 0.99508584, "learning_rate": 0.0009974447974719707, "loss": 1.00624466, "num_input_tokens_seen": 11831360, "router_z_loss_mlp": 0.23791504, "step": 154, "time_per_iteration": 2.685029983520508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126502, "balance_loss_mlp": 1.10213518, "epoch": 0.02981916121585225, "flos": 621089897472.0, "grad_norm": 0.12681443605953674, "language_loss": 1.01620197, "learning_rate": 0.0009987265200589763, "loss": 1.02746701, "num_input_tokens_seen": 11902192, "router_z_loss_mlp": 0.24365234, "step": 155, "time_per_iteration": 2.7264955043792725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119337, "balance_loss_mlp": 1.09590077, "epoch": 0.030011542901115813, "flos": 661633505280.0, "grad_norm": 0.07965097154096117, "language_loss": 1.01522899, "learning_rate": 0.001, "loss": 1.02642226, "num_input_tokens_seen": 11979088, "router_z_loss_mlp": 0.23400879, "step": 156, "time_per_iteration": 2.864698886871338 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111456, "balance_loss_mlp": 1.09257805, "epoch": 0.030203924586379376, "flos": 651569826816.0, "grad_norm": 0.061020534493473076, "language_loss": 0.9859184, "learning_rate": 0.0009999999029413921, "loss": 0.99706399, "num_input_tokens_seen": 12059200, "router_z_loss_mlp": 0.2199707, "step": 157, "time_per_iteration": 2.8241283893585205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125679, "balance_loss_mlp": 1.1049242, "epoch": 0.03039630627164294, "flos": 531354484224.0, "grad_norm": 0.05862251807890935, "language_loss": 1.00346851, "learning_rate": 0.0009999996117656068, "loss": 1.01472545, "num_input_tokens_seen": 12134944, "router_z_loss_mlp": 0.2076416, "step": 158, "time_per_iteration": 2.7097458839416504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113624, "balance_loss_mlp": 1.09279847, "epoch": 0.030588687956906502, "flos": 586189509120.0, "grad_norm": 0.09545570145123992, "language_loss": 0.93653512, "learning_rate": 0.0009999991264727564, "loss": 0.94767129, "num_input_tokens_seen": 12207936, "router_z_loss_mlp": 0.20837402, "step": 159, "time_per_iteration": 2.756363868713379 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110635, "balance_loss_mlp": 1.08577418, "epoch": 0.030781069642170065, "flos": 513278042112.0, "grad_norm": 0.09475469160316574, "language_loss": 1.04571712, "learning_rate": 0.0009999984470630296, "loss": 1.05678058, "num_input_tokens_seen": 12273200, "router_z_loss_mlp": 0.20581055, "step": 160, "time_per_iteration": 2.5990707874298096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112086, "balance_loss_mlp": 1.09061611, "epoch": 0.030973451327433628, "flos": 718123719168.0, "grad_norm": 0.07420241291943742, "language_loss": 0.9342289, "learning_rate": 0.0009999975735366902, "loss": 0.94534969, "num_input_tokens_seen": 12359600, "router_z_loss_mlp": 0.21472168, "step": 161, "time_per_iteration": 3.06878662109375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114184, "balance_loss_mlp": 1.09270215, "epoch": 0.03116583301269719, "flos": 1109771311104.0, "grad_norm": 0.0799449593456649, "language_loss": 0.95189524, "learning_rate": 0.0009999965058940775, "loss": 0.96303707, "num_input_tokens_seen": 12443936, "router_z_loss_mlp": 0.21484375, "step": 162, "time_per_iteration": 3.4937808513641357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112457, "balance_loss_mlp": 1.10226631, "epoch": 0.031358214697960754, "flos": 450907098624.0, "grad_norm": 0.08293329451395655, "language_loss": 1.01278222, "learning_rate": 0.0009999952441356057, "loss": 1.02402782, "num_input_tokens_seen": 12507488, "router_z_loss_mlp": 0.22314453, "step": 163, "time_per_iteration": 2.535121202468872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109663, "balance_loss_mlp": 1.08820534, "epoch": 0.031550596383224314, "flos": 1255176870912.0, "grad_norm": 0.06727245316799851, "language_loss": 1.0154388, "learning_rate": 0.000999993788261765, "loss": 1.02653539, "num_input_tokens_seen": 12594096, "router_z_loss_mlp": 0.21472168, "step": 164, "time_per_iteration": 3.5832889080047607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110987, "balance_loss_mlp": 1.08942175, "epoch": 0.03174297806848788, "flos": 668136812544.0, "grad_norm": 0.07205404441274409, "language_loss": 1.03110182, "learning_rate": 0.00099999213827312, "loss": 1.04221165, "num_input_tokens_seen": 12669424, "router_z_loss_mlp": 0.21569824, "step": 165, "time_per_iteration": 2.8096628189086914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118839, "balance_loss_mlp": 1.09684491, "epoch": 0.03193535975375144, "flos": 551299032576.0, "grad_norm": 0.050309165813849886, "language_loss": 0.98088074, "learning_rate": 0.000999990294170312, "loss": 0.99206913, "num_input_tokens_seen": 12740080, "router_z_loss_mlp": 0.22009277, "step": 166, "time_per_iteration": 2.663135051727295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116813, "balance_loss_mlp": 1.09486628, "epoch": 0.032127741439015006, "flos": 543649314816.0, "grad_norm": 0.06058681172545402, "language_loss": 1.02190185, "learning_rate": 0.0009999882559540566, "loss": 1.03306985, "num_input_tokens_seen": 12810576, "router_z_loss_mlp": 0.21948242, "step": 167, "time_per_iteration": 2.649784564971924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118672, "balance_loss_mlp": 1.09543872, "epoch": 0.032320123124278566, "flos": 548385831936.0, "grad_norm": 0.10019647540930027, "language_loss": 0.98887956, "learning_rate": 0.000999986023625145, "loss": 1.00006628, "num_input_tokens_seen": 12887904, "router_z_loss_mlp": 0.23217773, "step": 168, "time_per_iteration": 2.6998720169067383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01817799, "balance_loss_mlp": 1.79767668, "epoch": 0.03251250480954213, "flos": 1305886551552.0, "grad_norm": 0.21411409700219255, "language_loss": 0.78924417, "learning_rate": 0.0009999835971844441, "loss": 0.80742216, "num_input_tokens_seen": 13107344, "router_z_loss_mlp": 0.20117188, "step": 169, "time_per_iteration": 5.029488563537598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112576, "balance_loss_mlp": 1.10157228, "epoch": 0.03270488649480569, "flos": 561132914688.0, "grad_norm": 0.09130724925200479, "language_loss": 0.99515283, "learning_rate": 0.0009999809766328958, "loss": 1.00641036, "num_input_tokens_seen": 13175552, "router_z_loss_mlp": 0.24206543, "step": 170, "time_per_iteration": 2.6508679389953613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153612, "balance_loss_mlp": 1.12968671, "epoch": 0.03289726818006926, "flos": 482363112960.0, "grad_norm": 0.0981725040523357, "language_loss": 1.01766157, "learning_rate": 0.0009999781619715177, "loss": 1.02919769, "num_input_tokens_seen": 13242384, "router_z_loss_mlp": 0.23925781, "step": 171, "time_per_iteration": 2.5449466705322266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151071, "balance_loss_mlp": 1.12767053, "epoch": 0.03308964986533282, "flos": 674647460352.0, "grad_norm": 0.10018141203760955, "language_loss": 1.0104121, "learning_rate": 0.000999975153201402, "loss": 1.02192283, "num_input_tokens_seen": 13316160, "router_z_loss_mlp": 0.23388672, "step": 172, "time_per_iteration": 2.8463308811187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114097, "balance_loss_mlp": 1.11745048, "epoch": 0.033282031550596385, "flos": 609217583616.0, "grad_norm": 0.05920698759335099, "language_loss": 0.98661143, "learning_rate": 0.0009999719503237174, "loss": 0.99802113, "num_input_tokens_seen": 13387664, "router_z_loss_mlp": 0.23498535, "step": 173, "time_per_iteration": 2.733147144317627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157549, "balance_loss_mlp": 1.1333611, "epoch": 0.033474413235859944, "flos": 468039762432.0, "grad_norm": 0.12686135486457134, "language_loss": 1.07479167, "learning_rate": 0.0009999685533397073, "loss": 1.08636713, "num_input_tokens_seen": 13454528, "router_z_loss_mlp": 0.24194336, "step": 174, "time_per_iteration": 2.5705809593200684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110896, "balance_loss_mlp": 1.08707762, "epoch": 0.03366679492112351, "flos": 579634444800.0, "grad_norm": 0.07652801902249555, "language_loss": 0.99758261, "learning_rate": 0.00099996496225069, "loss": 1.00869155, "num_input_tokens_seen": 13522528, "router_z_loss_mlp": 0.23815918, "step": 175, "time_per_iteration": 2.6572659015655518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118018, "balance_loss_mlp": 1.09399772, "epoch": 0.03385917660638707, "flos": 637678904832.0, "grad_norm": 0.05463854096335067, "language_loss": 1.01895058, "learning_rate": 0.0009999611770580604, "loss": 1.03013086, "num_input_tokens_seen": 13601120, "router_z_loss_mlp": 0.24023438, "step": 176, "time_per_iteration": 2.8216159343719482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121175, "balance_loss_mlp": 1.09596181, "epoch": 0.03405155829165064, "flos": 441816933888.0, "grad_norm": 0.08810438351502946, "language_loss": 1.01167393, "learning_rate": 0.0009999571977632876, "loss": 1.02288568, "num_input_tokens_seen": 13666384, "router_z_loss_mlp": 0.25231934, "step": 177, "time_per_iteration": 2.581037998199463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115073, "balance_loss_mlp": 1.09040904, "epoch": 0.034243939976914196, "flos": 466332443136.0, "grad_norm": 0.08419866181616258, "language_loss": 1.03353202, "learning_rate": 0.0009999530243679166, "loss": 1.04468274, "num_input_tokens_seen": 13733968, "router_z_loss_mlp": 0.24682617, "step": 178, "time_per_iteration": 2.5844500064849854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137225, "balance_loss_mlp": 1.11332321, "epoch": 0.03443632166217776, "flos": 779276671488.0, "grad_norm": 0.13671082465577608, "language_loss": 0.99045932, "learning_rate": 0.0009999486568735675, "loss": 1.00183165, "num_input_tokens_seen": 13818960, "router_z_loss_mlp": 0.23913574, "step": 179, "time_per_iteration": 3.044409990310669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125047, "balance_loss_mlp": 1.1010983, "epoch": 0.03462870334744132, "flos": 1263777707520.0, "grad_norm": 0.0738854697341979, "language_loss": 0.99422705, "learning_rate": 0.0009999440952819362, "loss": 1.00547755, "num_input_tokens_seen": 13912448, "router_z_loss_mlp": 0.23950195, "step": 180, "time_per_iteration": 3.644280433654785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112251, "balance_loss_mlp": 1.08836114, "epoch": 0.03482108503270489, "flos": 607179151872.0, "grad_norm": 0.04789131390967285, "language_loss": 0.98983485, "learning_rate": 0.0009999393395947935, "loss": 1.00095737, "num_input_tokens_seen": 13990752, "router_z_loss_mlp": 0.2388916, "step": 181, "time_per_iteration": 2.8229053020477295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114504, "balance_loss_mlp": 1.08992302, "epoch": 0.03501346671796845, "flos": 538270396416.0, "grad_norm": 0.08040661288612141, "language_loss": 1.02358437, "learning_rate": 0.0009999343898139858, "loss": 1.03472936, "num_input_tokens_seen": 14058608, "router_z_loss_mlp": 0.24584961, "step": 182, "time_per_iteration": 2.6112709045410156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123737, "balance_loss_mlp": 1.09824967, "epoch": 0.035205848403232015, "flos": 518484063744.0, "grad_norm": 0.0879280890069936, "language_loss": 1.01010704, "learning_rate": 0.0009999292459414348, "loss": 1.02134442, "num_input_tokens_seen": 14126656, "router_z_loss_mlp": 0.25476074, "step": 183, "time_per_iteration": 2.574800491333008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111495, "balance_loss_mlp": 1.08559036, "epoch": 0.035398230088495575, "flos": 472373586432.0, "grad_norm": 0.08068750200828848, "language_loss": 1.05455053, "learning_rate": 0.0009999239079791374, "loss": 1.06566548, "num_input_tokens_seen": 14195840, "router_z_loss_mlp": 0.25915527, "step": 184, "time_per_iteration": 2.5650548934936523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110884, "balance_loss_mlp": 1.08343673, "epoch": 0.03559061177375914, "flos": 512074732032.0, "grad_norm": 0.07300059562366337, "language_loss": 0.98493111, "learning_rate": 0.0009999183759291659, "loss": 0.99601954, "num_input_tokens_seen": 14269936, "router_z_loss_mlp": 0.25427246, "step": 185, "time_per_iteration": 2.7383785247802734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110571, "balance_loss_mlp": 1.08168936, "epoch": 0.0357829934590227, "flos": 477386887680.0, "grad_norm": 0.09426698036311254, "language_loss": 1.00536895, "learning_rate": 0.0009999126497936682, "loss": 1.01642609, "num_input_tokens_seen": 14334848, "router_z_loss_mlp": 0.24023438, "step": 186, "time_per_iteration": 2.5103538036346436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110913, "balance_loss_mlp": 1.08740544, "epoch": 0.03597537514428627, "flos": 644656485888.0, "grad_norm": 0.07507023604654985, "language_loss": 1.03590488, "learning_rate": 0.0009999067295748676, "loss": 1.047014, "num_input_tokens_seen": 14407888, "router_z_loss_mlp": 0.23510742, "step": 187, "time_per_iteration": 2.806403160095215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112247, "balance_loss_mlp": 1.09995186, "epoch": 0.03616775682954983, "flos": 581186119680.0, "grad_norm": 0.10679989437153373, "language_loss": 1.00781608, "learning_rate": 0.000999900615275062, "loss": 1.01904082, "num_input_tokens_seen": 14479072, "router_z_loss_mlp": 0.22509766, "step": 188, "time_per_iteration": 2.6750597953796387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105556, "balance_loss_mlp": 1.0823226, "epoch": 0.03636013851481339, "flos": 382420859904.0, "grad_norm": 0.06425431277780277, "language_loss": 1.06987619, "learning_rate": 0.0009998943068966256, "loss": 1.0809319, "num_input_tokens_seen": 14540944, "router_z_loss_mlp": 0.23242188, "step": 189, "time_per_iteration": 2.4297006130218506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106344, "balance_loss_mlp": 1.0826813, "epoch": 0.03655252020007695, "flos": 583224551424.0, "grad_norm": 0.07322572175010231, "language_loss": 1.01591444, "learning_rate": 0.0009998878044420072, "loss": 1.02697778, "num_input_tokens_seen": 14611392, "router_z_loss_mlp": 0.23669434, "step": 190, "time_per_iteration": 2.6686899662017822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108272, "balance_loss_mlp": 1.08489525, "epoch": 0.03674490188534051, "flos": 471619957248.0, "grad_norm": 0.07088525550270033, "language_loss": 0.97819, "learning_rate": 0.0009998811079137318, "loss": 0.98927271, "num_input_tokens_seen": 14679776, "router_z_loss_mlp": 0.23400879, "step": 191, "time_per_iteration": 2.5795974731445312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118931, "balance_loss_mlp": 1.09439743, "epoch": 0.03693728357060408, "flos": 528372274176.0, "grad_norm": 0.07437245365565072, "language_loss": 0.9895249, "learning_rate": 0.0009998742173143987, "loss": 1.0007143, "num_input_tokens_seen": 14749712, "router_z_loss_mlp": 0.24536133, "step": 192, "time_per_iteration": 2.6109251976013184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133748, "balance_loss_mlp": 1.10824919, "epoch": 0.03712966525586764, "flos": 798993994752.0, "grad_norm": 0.06698686336952825, "language_loss": 0.98415262, "learning_rate": 0.0009998671326466833, "loss": 0.99549013, "num_input_tokens_seen": 14827136, "router_z_loss_mlp": 0.25524902, "step": 193, "time_per_iteration": 2.955780506134033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136952, "balance_loss_mlp": 1.10922432, "epoch": 0.037322046941131205, "flos": 829973164032.0, "grad_norm": 0.07154145387165563, "language_loss": 0.99267447, "learning_rate": 0.0009998598539133362, "loss": 1.00404394, "num_input_tokens_seen": 14902880, "router_z_loss_mlp": 0.27734375, "step": 194, "time_per_iteration": 3.0137686729431152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163557, "balance_loss_mlp": 1.13373041, "epoch": 0.037514428626394765, "flos": 437685742080.0, "grad_norm": 0.09795763902625766, "language_loss": 1.00780571, "learning_rate": 0.0009998523811171828, "loss": 1.01944125, "num_input_tokens_seen": 14967264, "router_z_loss_mlp": 0.2980957, "step": 195, "time_per_iteration": 2.5090267658233643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164155, "balance_loss_mlp": 1.13323212, "epoch": 0.03770681031165833, "flos": 511625051136.0, "grad_norm": 0.0756543485462421, "language_loss": 1.0036695, "learning_rate": 0.0009998447142611248, "loss": 1.015311, "num_input_tokens_seen": 15039104, "router_z_loss_mlp": 0.30883789, "step": 196, "time_per_iteration": 2.653759241104126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156318, "balance_loss_mlp": 1.12615836, "epoch": 0.03789919199692189, "flos": 807449098752.0, "grad_norm": 0.10738469994654526, "language_loss": 0.9438082, "learning_rate": 0.0009998368533481387, "loss": 0.95537138, "num_input_tokens_seen": 15124864, "router_z_loss_mlp": 0.30126953, "step": 197, "time_per_iteration": 3.03090763092041 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123277, "balance_loss_mlp": 1.09433353, "epoch": 0.03809157368218546, "flos": 690576814080.0, "grad_norm": 0.08947148055588174, "language_loss": 0.97516447, "learning_rate": 0.0009998287983812762, "loss": 0.98639727, "num_input_tokens_seen": 15199680, "router_z_loss_mlp": 0.28930664, "step": 198, "time_per_iteration": 2.842519760131836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133095, "balance_loss_mlp": 1.10672641, "epoch": 0.03828395536744902, "flos": 517940407296.0, "grad_norm": 0.08719552456544254, "language_loss": 1.03183711, "learning_rate": 0.0009998205493636646, "loss": 1.04316807, "num_input_tokens_seen": 15270176, "router_z_loss_mlp": 0.26416016, "step": 199, "time_per_iteration": 2.657094955444336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099837, "balance_loss_mlp": 1.07485092, "epoch": 0.038476337052712584, "flos": 581662964736.0, "grad_norm": 0.11937452390124363, "language_loss": 0.95869702, "learning_rate": 0.0009998121062985063, "loss": 0.96969533, "num_input_tokens_seen": 15343168, "router_z_loss_mlp": 0.24987793, "step": 200, "time_per_iteration": 2.6954355239868164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108694, "balance_loss_mlp": 1.08444691, "epoch": 0.03866871873797614, "flos": 577086861312.0, "grad_norm": 0.09459530753006626, "language_loss": 0.98493665, "learning_rate": 0.0009998034691890794, "loss": 0.9960236, "num_input_tokens_seen": 15417328, "router_z_loss_mlp": 0.24243164, "step": 201, "time_per_iteration": 2.7717928886413574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104411, "balance_loss_mlp": 1.08075976, "epoch": 0.03886110042323971, "flos": 540731344896.0, "grad_norm": 0.07675440437740683, "language_loss": 1.0290482, "learning_rate": 0.0009997946380387369, "loss": 1.04009235, "num_input_tokens_seen": 15489488, "router_z_loss_mlp": 0.23632812, "step": 202, "time_per_iteration": 2.63975191116333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111336, "balance_loss_mlp": 1.08706474, "epoch": 0.03905348210850327, "flos": 718002952704.0, "grad_norm": 0.09220046036918417, "language_loss": 1.04956245, "learning_rate": 0.0009997856128509076, "loss": 1.06067586, "num_input_tokens_seen": 15558944, "router_z_loss_mlp": 0.24279785, "step": 203, "time_per_iteration": 2.856816053390503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124883, "balance_loss_mlp": 1.10112453, "epoch": 0.039245863793766836, "flos": 427493583360.0, "grad_norm": 0.08622839045605694, "language_loss": 0.99688643, "learning_rate": 0.0009997763936290952, "loss": 1.00813532, "num_input_tokens_seen": 15625024, "router_z_loss_mlp": 0.23754883, "step": 204, "time_per_iteration": 2.5392112731933594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113243, "balance_loss_mlp": 1.10773039, "epoch": 0.039438245479030395, "flos": 663096347136.0, "grad_norm": 0.09842935942049862, "language_loss": 1.0453217, "learning_rate": 0.0009997669803768789, "loss": 1.05664587, "num_input_tokens_seen": 15697120, "router_z_loss_mlp": 0.24694824, "step": 205, "time_per_iteration": 2.7708992958068848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108465, "balance_loss_mlp": 1.08426595, "epoch": 0.03963062716429396, "flos": 635349007872.0, "grad_norm": 0.10843184908981528, "language_loss": 0.9984858, "learning_rate": 0.0009997573730979134, "loss": 1.00957048, "num_input_tokens_seen": 15768752, "router_z_loss_mlp": 0.24194336, "step": 206, "time_per_iteration": 2.7474939823150635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01685643, "balance_loss_mlp": 1.6616106, "epoch": 0.03982300884955752, "flos": 1418565975552.0, "grad_norm": 0.13014896830523812, "language_loss": 0.79193199, "learning_rate": 0.0009997475717959284, "loss": 0.80878842, "num_input_tokens_seen": 15980624, "router_z_loss_mlp": 0.24023438, "step": 207, "time_per_iteration": 4.682751655578613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109669, "balance_loss_mlp": 1.08474243, "epoch": 0.04001539053482109, "flos": 689118741504.0, "grad_norm": 0.07677308889428856, "language_loss": 0.98866731, "learning_rate": 0.0009997375764747294, "loss": 0.99976397, "num_input_tokens_seen": 16067232, "router_z_loss_mlp": 0.24926758, "step": 208, "time_per_iteration": 2.9866418838500977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110763, "balance_loss_mlp": 1.08659935, "epoch": 0.04020777222008465, "flos": 533639964672.0, "grad_norm": 0.07362493409063897, "language_loss": 0.96845645, "learning_rate": 0.0009997273871381967, "loss": 0.97956407, "num_input_tokens_seen": 16139808, "router_z_loss_mlp": 0.24169922, "step": 209, "time_per_iteration": 2.7354848384857178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125333, "balance_loss_mlp": 1.09998906, "epoch": 0.040400153905348214, "flos": 567927687168.0, "grad_norm": 0.07873798613461079, "language_loss": 1.01664305, "learning_rate": 0.0009997170037902862, "loss": 1.0278964, "num_input_tokens_seen": 16210848, "router_z_loss_mlp": 0.25366211, "step": 210, "time_per_iteration": 2.704061269760132 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120611, "balance_loss_mlp": 1.09462297, "epoch": 0.040592535590611774, "flos": 713439332352.0, "grad_norm": 0.06515356853390573, "language_loss": 1.04550838, "learning_rate": 0.0009997064264350292, "loss": 1.05671442, "num_input_tokens_seen": 16283984, "router_z_loss_mlp": 0.26013184, "step": 211, "time_per_iteration": 2.8975577354431152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113542, "balance_loss_mlp": 1.08662462, "epoch": 0.04078491727587533, "flos": 578100022272.0, "grad_norm": 0.07652094351016743, "language_loss": 0.98263478, "learning_rate": 0.0009996956550765317, "loss": 0.99377024, "num_input_tokens_seen": 16353904, "router_z_loss_mlp": 0.26928711, "step": 212, "time_per_iteration": 2.6716954708099365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125588, "balance_loss_mlp": 1.09752572, "epoch": 0.0409772989611389, "flos": 552299710464.0, "grad_norm": 0.07289633346919515, "language_loss": 0.93075061, "learning_rate": 0.0009996846897189762, "loss": 0.94200653, "num_input_tokens_seen": 16425488, "router_z_loss_mlp": 0.28051758, "step": 213, "time_per_iteration": 2.621661901473999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110085, "balance_loss_mlp": 1.08412087, "epoch": 0.04116968064640246, "flos": 555630285312.0, "grad_norm": 0.055838089119108855, "language_loss": 0.99370623, "learning_rate": 0.0009996735303666193, "loss": 1.004807, "num_input_tokens_seen": 16498016, "router_z_loss_mlp": 0.2598877, "step": 214, "time_per_iteration": 2.6928601264953613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095032, "balance_loss_mlp": 1.06966448, "epoch": 0.041362062331666026, "flos": 578492803584.0, "grad_norm": 0.04962656356162825, "language_loss": 1.01034558, "learning_rate": 0.0009996621770237937, "loss": 1.02129602, "num_input_tokens_seen": 16573744, "router_z_loss_mlp": 0.25390625, "step": 215, "time_per_iteration": 2.760256290435791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098352, "balance_loss_mlp": 1.07167339, "epoch": 0.041554444016929586, "flos": 611443593216.0, "grad_norm": 0.06820201547086252, "language_loss": 0.97216904, "learning_rate": 0.0009996506296949073, "loss": 0.98315251, "num_input_tokens_seen": 16655344, "router_z_loss_mlp": 0.26708984, "step": 216, "time_per_iteration": 2.921712636947632 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106582, "balance_loss_mlp": 1.0792954, "epoch": 0.04174682570219315, "flos": 528115313664.0, "grad_norm": 0.05678696526689756, "language_loss": 0.96681535, "learning_rate": 0.0009996388883844428, "loss": 0.97788119, "num_input_tokens_seen": 16726480, "router_z_loss_mlp": 0.27282715, "step": 217, "time_per_iteration": 2.6392288208007812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092837, "balance_loss_mlp": 1.06704009, "epoch": 0.04193920738745671, "flos": 511506482688.0, "grad_norm": 0.06325985488704432, "language_loss": 1.01514912, "learning_rate": 0.0009996269530969588, "loss": 1.02607751, "num_input_tokens_seen": 16792112, "router_z_loss_mlp": 0.25830078, "step": 218, "time_per_iteration": 2.6588566303253174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105308, "balance_loss_mlp": 1.08038127, "epoch": 0.04213158907272028, "flos": 571490629632.0, "grad_norm": 0.07879458740668356, "language_loss": 0.99769139, "learning_rate": 0.0009996148238370888, "loss": 1.00874448, "num_input_tokens_seen": 16862960, "router_z_loss_mlp": 0.24938965, "step": 219, "time_per_iteration": 2.7322278022766113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103161, "balance_loss_mlp": 1.07711363, "epoch": 0.04232397075798384, "flos": 964222589952.0, "grad_norm": 0.0629407592127239, "language_loss": 0.95434463, "learning_rate": 0.0009996025006095421, "loss": 0.96537632, "num_input_tokens_seen": 16950416, "router_z_loss_mlp": 0.26049805, "step": 220, "time_per_iteration": 3.336355209350586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02460831, "balance_loss_mlp": 2.43965983, "epoch": 0.042516352443247404, "flos": 1469595778560.0, "grad_norm": 0.4526401201513886, "language_loss": 0.77783144, "learning_rate": 0.0009995899834191028, "loss": 0.80243975, "num_input_tokens_seen": 17180944, "router_z_loss_mlp": 0.21191406, "step": 221, "time_per_iteration": 5.584397315979004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138695, "balance_loss_mlp": 1.11146736, "epoch": 0.042708734128510964, "flos": 654712823808.0, "grad_norm": 0.08000509590360377, "language_loss": 0.96767551, "learning_rate": 0.0009995772722706307, "loss": 0.9790625, "num_input_tokens_seen": 17257792, "router_z_loss_mlp": 0.27246094, "step": 222, "time_per_iteration": 2.932035207748413 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177646, "balance_loss_mlp": 1.14898777, "epoch": 0.04290111581377453, "flos": 431827407360.0, "grad_norm": 0.06295735346771135, "language_loss": 1.10290885, "learning_rate": 0.0009995643671690604, "loss": 1.1146853, "num_input_tokens_seen": 17320288, "router_z_loss_mlp": 0.28686523, "step": 223, "time_per_iteration": 2.489574909210205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0118606, "balance_loss_mlp": 1.15768862, "epoch": 0.04309349749903809, "flos": 644676309504.0, "grad_norm": 0.06397701682602697, "language_loss": 0.97599596, "learning_rate": 0.0009995512681194023, "loss": 0.98785651, "num_input_tokens_seen": 17396672, "router_z_loss_mlp": 0.28369141, "step": 224, "time_per_iteration": 2.8617055416107178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204697, "balance_loss_mlp": 1.17644429, "epoch": 0.04328587918430166, "flos": 831267505152.0, "grad_norm": 0.0569906191636753, "language_loss": 0.95713508, "learning_rate": 0.0009995379751267417, "loss": 0.96918201, "num_input_tokens_seen": 17488096, "router_z_loss_mlp": 0.28295898, "step": 225, "time_per_iteration": 3.272956371307373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01211045, "balance_loss_mlp": 1.17959809, "epoch": 0.043478260869565216, "flos": 525066292224.0, "grad_norm": 0.06210348551978246, "language_loss": 0.970909, "learning_rate": 0.0009995244881962398, "loss": 0.98301941, "num_input_tokens_seen": 17557632, "router_z_loss_mlp": 0.31420898, "step": 226, "time_per_iteration": 2.629014253616333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01207143, "balance_loss_mlp": 1.17750776, "epoch": 0.04367064255482878, "flos": 439484465664.0, "grad_norm": 0.06412842399528458, "language_loss": 0.97423029, "learning_rate": 0.0009995108073331323, "loss": 0.98630178, "num_input_tokens_seen": 17626672, "router_z_loss_mlp": 0.29614258, "step": 227, "time_per_iteration": 2.598266124725342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01209228, "balance_loss_mlp": 1.1790204, "epoch": 0.04386302424009234, "flos": 507380060160.0, "grad_norm": 0.05900157234221112, "language_loss": 1.00919747, "learning_rate": 0.0009994969325427309, "loss": 1.02128983, "num_input_tokens_seen": 17698624, "router_z_loss_mlp": 0.30200195, "step": 228, "time_per_iteration": 2.681445598602295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01208149, "balance_loss_mlp": 1.17727375, "epoch": 0.04405540592535591, "flos": 540694268928.0, "grad_norm": 0.08372721248844238, "language_loss": 0.96768719, "learning_rate": 0.0009994828638304218, "loss": 0.97976863, "num_input_tokens_seen": 17767760, "router_z_loss_mlp": 0.30883789, "step": 229, "time_per_iteration": 2.6330137252807617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213023, "balance_loss_mlp": 1.18202829, "epoch": 0.04424778761061947, "flos": 446370642432.0, "grad_norm": 0.09332052147555223, "language_loss": 1.02555704, "learning_rate": 0.0009994686012016675, "loss": 1.0376873, "num_input_tokens_seen": 17833664, "router_z_loss_mlp": 0.30981445, "step": 230, "time_per_iteration": 2.519575595855713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01205079, "balance_loss_mlp": 1.17470419, "epoch": 0.044440169295883035, "flos": 700702161408.0, "grad_norm": 0.07303811655625075, "language_loss": 1.02279592, "learning_rate": 0.000999454144662005, "loss": 1.03484678, "num_input_tokens_seen": 17908880, "router_z_loss_mlp": 0.3034668, "step": 231, "time_per_iteration": 2.8772194385528564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01200788, "balance_loss_mlp": 1.16729009, "epoch": 0.044632550981146595, "flos": 588329256960.0, "grad_norm": 0.05982585511102693, "language_loss": 0.9550131, "learning_rate": 0.0009994394942170468, "loss": 0.96702093, "num_input_tokens_seen": 17978208, "router_z_loss_mlp": 0.33520508, "step": 232, "time_per_iteration": 2.705536127090454 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01200355, "balance_loss_mlp": 1.16673827, "epoch": 0.04482493266641016, "flos": 554797734912.0, "grad_norm": 0.06482734437318205, "language_loss": 0.93872058, "learning_rate": 0.0009994246498724808, "loss": 0.95072412, "num_input_tokens_seen": 18049296, "router_z_loss_mlp": 0.33642578, "step": 233, "time_per_iteration": 2.729526996612549 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204357, "balance_loss_mlp": 1.17043054, "epoch": 0.04501731435167372, "flos": 722813621760.0, "grad_norm": 0.06840473363398163, "language_loss": 0.96267349, "learning_rate": 0.00099940961163407, "loss": 0.97471702, "num_input_tokens_seen": 18123296, "router_z_loss_mlp": 0.33935547, "step": 234, "time_per_iteration": 2.8506321907043457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01210646, "balance_loss_mlp": 1.1758604, "epoch": 0.04520969603693728, "flos": 511790607360.0, "grad_norm": 0.061734633326469966, "language_loss": 0.99016106, "learning_rate": 0.0009993943795076528, "loss": 1.0022676, "num_input_tokens_seen": 18192784, "router_z_loss_mlp": 0.34814453, "step": 235, "time_per_iteration": 2.6817193031311035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.012082, "balance_loss_mlp": 1.17379582, "epoch": 0.04540207772220085, "flos": 365058399744.0, "grad_norm": 0.07722659013027651, "language_loss": 1.01211047, "learning_rate": 0.0009993789534991427, "loss": 1.02419257, "num_input_tokens_seen": 18254064, "router_z_loss_mlp": 0.34423828, "step": 236, "time_per_iteration": 2.4797797203063965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01216471, "balance_loss_mlp": 1.18354487, "epoch": 0.045594459407464406, "flos": 522669583872.0, "grad_norm": 0.057771959372629855, "language_loss": 0.96296465, "learning_rate": 0.0009993633336145287, "loss": 0.97512937, "num_input_tokens_seen": 18325728, "router_z_loss_mlp": 0.3293457, "step": 237, "time_per_iteration": 2.629390001296997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01225643, "balance_loss_mlp": 1.19369495, "epoch": 0.04578684109272797, "flos": 671776104960.0, "grad_norm": 0.07668042159358972, "language_loss": 1.00654197, "learning_rate": 0.0009993475198598752, "loss": 1.01879823, "num_input_tokens_seen": 18408608, "router_z_loss_mlp": 0.31958008, "step": 238, "time_per_iteration": 3.01481032371521 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01220207, "balance_loss_mlp": 1.1866858, "epoch": 0.04597922277799153, "flos": 541633277952.0, "grad_norm": 0.08994725037560618, "language_loss": 0.96828419, "learning_rate": 0.0009993315122413212, "loss": 0.98048627, "num_input_tokens_seen": 18471920, "router_z_loss_mlp": 0.33544922, "step": 239, "time_per_iteration": 2.6483867168426514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01215592, "balance_loss_mlp": 1.18042517, "epoch": 0.0461716044632551, "flos": 458984102400.0, "grad_norm": 0.08238446857980607, "language_loss": 0.9678297, "learning_rate": 0.0009993153107650818, "loss": 0.97998565, "num_input_tokens_seen": 18540496, "router_z_loss_mlp": 0.35180664, "step": 240, "time_per_iteration": 2.594534158706665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199347, "balance_loss_mlp": 1.16303563, "epoch": 0.04636398614851866, "flos": 455240922624.0, "grad_norm": 0.09316981102360596, "language_loss": 0.96465278, "learning_rate": 0.0009992989154374468, "loss": 0.9766463, "num_input_tokens_seen": 18606944, "router_z_loss_mlp": 0.36328125, "step": 241, "time_per_iteration": 2.5503900051116943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01190623, "balance_loss_mlp": 1.15631413, "epoch": 0.046556367833782225, "flos": 556826254848.0, "grad_norm": 0.06540072726643342, "language_loss": 1.03219867, "learning_rate": 0.0009992823262647817, "loss": 1.04410505, "num_input_tokens_seen": 18679520, "router_z_loss_mlp": 0.34301758, "step": 242, "time_per_iteration": 2.7218894958496094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156597, "balance_loss_mlp": 1.1235044, "epoch": 0.046748749519045785, "flos": 592917470208.0, "grad_norm": 0.09177405734811558, "language_loss": 0.97326249, "learning_rate": 0.0009992655432535264, "loss": 0.98482847, "num_input_tokens_seen": 18756656, "router_z_loss_mlp": 0.33105469, "step": 243, "time_per_iteration": 2.800133466720581 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136682, "balance_loss_mlp": 1.10614085, "epoch": 0.04694113120430935, "flos": 569864802816.0, "grad_norm": 0.0753000751829641, "language_loss": 0.98140877, "learning_rate": 0.0009992485664101973, "loss": 0.99277562, "num_input_tokens_seen": 18829792, "router_z_loss_mlp": 0.30517578, "step": 244, "time_per_iteration": 2.6863763332366943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115952, "balance_loss_mlp": 1.08648348, "epoch": 0.04713351288957291, "flos": 863768987136.0, "grad_norm": 0.06369495608278983, "language_loss": 1.00049853, "learning_rate": 0.000999231395741385, "loss": 1.01165819, "num_input_tokens_seen": 18906864, "router_z_loss_mlp": 0.29467773, "step": 245, "time_per_iteration": 3.145612955093384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104415, "balance_loss_mlp": 1.0764488, "epoch": 0.04732589457483648, "flos": 537215390208.0, "grad_norm": 0.058358007346171054, "language_loss": 0.97651666, "learning_rate": 0.0009992140312537557, "loss": 0.98756075, "num_input_tokens_seen": 18973632, "router_z_loss_mlp": 0.2800293, "step": 246, "time_per_iteration": 2.612847328186035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092763, "balance_loss_mlp": 1.06641817, "epoch": 0.04751827626010004, "flos": 761906870784.0, "grad_norm": 0.0813165094086701, "language_loss": 0.93562448, "learning_rate": 0.000999196472954051, "loss": 0.94655204, "num_input_tokens_seen": 19052944, "router_z_loss_mlp": 0.26379395, "step": 247, "time_per_iteration": 2.9633545875549316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02706023, "balance_loss_mlp": 2.55038333, "epoch": 0.0477106579453636, "flos": 1579791859200.0, "grad_norm": 0.26644214904670055, "language_loss": 0.79424852, "learning_rate": 0.0009991787208490878, "loss": 0.82130873, "num_input_tokens_seen": 19286288, "router_z_loss_mlp": 1.5546875, "step": 248, "time_per_iteration": 5.665804624557495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151493, "balance_loss_mlp": 1.12381256, "epoch": 0.04790303963062716, "flos": 457766111232.0, "grad_norm": 0.07780849766073628, "language_loss": 1.00670481, "learning_rate": 0.0009991607749457578, "loss": 1.01821971, "num_input_tokens_seen": 19349296, "router_z_loss_mlp": 0.27709961, "step": 249, "time_per_iteration": 2.511357069015503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173992, "balance_loss_mlp": 1.14483345, "epoch": 0.04809542131589073, "flos": 782419668480.0, "grad_norm": 0.08242230719461915, "language_loss": 0.98555326, "learning_rate": 0.0009991426352510286, "loss": 0.99729323, "num_input_tokens_seen": 19428416, "router_z_loss_mlp": 0.29174805, "step": 250, "time_per_iteration": 2.9747626781463623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213643, "balance_loss_mlp": 1.18186164, "epoch": 0.04828780300115429, "flos": 559260039168.0, "grad_norm": 0.08110439009499554, "language_loss": 0.99640858, "learning_rate": 0.0009991243017719422, "loss": 1.00854492, "num_input_tokens_seen": 19498688, "router_z_loss_mlp": 0.31787109, "step": 251, "time_per_iteration": 2.6450002193450928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01247147, "balance_loss_mlp": 1.21276748, "epoch": 0.048480184686417856, "flos": 501929561088.0, "grad_norm": 0.09531666026222298, "language_loss": 0.94547766, "learning_rate": 0.0009991057745156165, "loss": 0.95794916, "num_input_tokens_seen": 19567568, "router_z_loss_mlp": 0.34375, "step": 252, "time_per_iteration": 2.608226776123047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0212821, "balance_loss_mlp": 2.05687547, "epoch": 0.048672566371681415, "flos": 1536360016896.0, "grad_norm": 0.23568337742673945, "language_loss": 0.81910986, "learning_rate": 0.0009990870534892446, "loss": 0.84039193, "num_input_tokens_seen": 19796368, "router_z_loss_mlp": 0.71484375, "step": 253, "time_per_iteration": 5.009166955947876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01253718, "balance_loss_mlp": 1.22112656, "epoch": 0.04886494805694498, "flos": 537922031616.0, "grad_norm": 0.11732554794190522, "language_loss": 1.02719152, "learning_rate": 0.0009990681387000943, "loss": 1.03972876, "num_input_tokens_seen": 19870480, "router_z_loss_mlp": 0.32568359, "step": 254, "time_per_iteration": 2.733544111251831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01259536, "balance_loss_mlp": 1.22959042, "epoch": 0.04905732974220854, "flos": 680169540096.0, "grad_norm": 0.10757948615664437, "language_loss": 0.99075437, "learning_rate": 0.0009990490301555093, "loss": 1.00334978, "num_input_tokens_seen": 19956288, "router_z_loss_mlp": 0.29907227, "step": 255, "time_per_iteration": 2.952223777770996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01833791, "balance_loss_mlp": 1.79201972, "epoch": 0.04924971142747211, "flos": 1421179997184.0, "grad_norm": 0.13001926806611183, "language_loss": 0.79215157, "learning_rate": 0.0009990297278629078, "loss": 0.81048942, "num_input_tokens_seen": 20180080, "router_z_loss_mlp": 0.41796875, "step": 256, "time_per_iteration": 4.834028244018555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01839647, "balance_loss_mlp": 1.7994014, "epoch": 0.04944209311273567, "flos": 1558006742016.0, "grad_norm": 0.11989001468728706, "language_loss": 0.79242742, "learning_rate": 0.000999010231829784, "loss": 0.81082386, "num_input_tokens_seen": 20413456, "router_z_loss_mlp": 0.40234375, "step": 257, "time_per_iteration": 4.963416814804077 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01764173, "balance_loss_mlp": 1.72659838, "epoch": 0.04963447479799923, "flos": 1570820262912.0, "grad_norm": 0.09913369297847359, "language_loss": 0.69975883, "learning_rate": 0.0009989905420637066, "loss": 0.71740055, "num_input_tokens_seen": 20644736, "router_z_loss_mlp": 0.375, "step": 258, "time_per_iteration": 4.860485076904297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01242536, "balance_loss_mlp": 1.21342516, "epoch": 0.049826856483262794, "flos": 625349569536.0, "grad_norm": 0.09740558448014502, "language_loss": 0.93272007, "learning_rate": 0.0009989706585723202, "loss": 0.94514549, "num_input_tokens_seen": 20719040, "router_z_loss_mlp": 0.29101562, "step": 259, "time_per_iteration": 2.763617753982544 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01252163, "balance_loss_mlp": 1.22202659, "epoch": 0.05001923816852635, "flos": 504160713216.0, "grad_norm": 0.1249592106702951, "language_loss": 0.99313855, "learning_rate": 0.0009989505813633442, "loss": 1.0056603, "num_input_tokens_seen": 20789376, "router_z_loss_mlp": 0.30102539, "step": 260, "time_per_iteration": 2.687018394470215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01240716, "balance_loss_mlp": 1.2099601, "epoch": 0.05021161985378992, "flos": 587345831424.0, "grad_norm": 0.12109163963871895, "language_loss": 0.99271172, "learning_rate": 0.000998930310444573, "loss": 1.00511885, "num_input_tokens_seen": 20857856, "router_z_loss_mlp": 0.30712891, "step": 261, "time_per_iteration": 2.7355992794036865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01194626, "balance_loss_mlp": 1.16220057, "epoch": 0.05040400153905348, "flos": 633303235584.0, "grad_norm": 0.10196827835843725, "language_loss": 0.96712077, "learning_rate": 0.0009989098458238765, "loss": 0.97906703, "num_input_tokens_seen": 20931232, "router_z_loss_mlp": 0.32421875, "step": 262, "time_per_iteration": 2.8160154819488525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0120265, "balance_loss_mlp": 1.16850853, "epoch": 0.050596383224317046, "flos": 553636270080.0, "grad_norm": 0.08050125519090791, "language_loss": 0.96376812, "learning_rate": 0.0009988891875091998, "loss": 0.97579467, "num_input_tokens_seen": 21012672, "router_z_loss_mlp": 0.34179688, "step": 263, "time_per_iteration": 2.7738425731658936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01221172, "balance_loss_mlp": 1.18657792, "epoch": 0.050788764909580605, "flos": 549663293952.0, "grad_norm": 0.09840792148235085, "language_loss": 0.91716301, "learning_rate": 0.0009988683355085636, "loss": 0.92937469, "num_input_tokens_seen": 21088592, "router_z_loss_mlp": 0.34619141, "step": 264, "time_per_iteration": 2.7763147354125977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01240941, "balance_loss_mlp": 1.20393836, "epoch": 0.05098114659484417, "flos": 605118325248.0, "grad_norm": 0.10851467261948886, "language_loss": 0.99809039, "learning_rate": 0.000998847289830063, "loss": 1.01049972, "num_input_tokens_seen": 21169840, "router_z_loss_mlp": 0.37011719, "step": 265, "time_per_iteration": 2.824655532836914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01228337, "balance_loss_mlp": 1.1930747, "epoch": 0.05117352828010773, "flos": 438548027904.0, "grad_norm": 0.10300549526892724, "language_loss": 0.92410266, "learning_rate": 0.0009988260504818682, "loss": 0.93638599, "num_input_tokens_seen": 21236144, "router_z_loss_mlp": 0.35253906, "step": 266, "time_per_iteration": 2.5484864711761475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187227, "balance_loss_mlp": 1.15127397, "epoch": 0.0513659099653713, "flos": 505032910848.0, "grad_norm": 0.08304900792028935, "language_loss": 0.99349552, "learning_rate": 0.000998804617472226, "loss": 1.00536776, "num_input_tokens_seen": 21304864, "router_z_loss_mlp": 0.35986328, "step": 267, "time_per_iteration": 2.67124342918396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115754, "balance_loss_mlp": 1.1241138, "epoch": 0.05155829165063486, "flos": 695488799232.0, "grad_norm": 0.09977621520267708, "language_loss": 0.94207335, "learning_rate": 0.0009987829908094568, "loss": 0.95364869, "num_input_tokens_seen": 21377504, "router_z_loss_mlp": 0.33447266, "step": 268, "time_per_iteration": 2.813934087753296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134088, "balance_loss_mlp": 1.09908843, "epoch": 0.051750673335898424, "flos": 1348260111360.0, "grad_norm": 0.11738978381138881, "language_loss": 1.00792646, "learning_rate": 0.0009987611705019569, "loss": 1.01926744, "num_input_tokens_seen": 21463840, "router_z_loss_mlp": 0.3503418, "step": 269, "time_per_iteration": 4.138862133026123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117111, "balance_loss_mlp": 1.08282614, "epoch": 0.051943055021161984, "flos": 489607566336.0, "grad_norm": 0.05348082980263852, "language_loss": 0.99369657, "learning_rate": 0.0009987391565581978, "loss": 1.00486767, "num_input_tokens_seen": 21531184, "router_z_loss_mlp": 0.34277344, "step": 270, "time_per_iteration": 2.586071014404297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126117, "balance_loss_mlp": 1.09176075, "epoch": 0.05213543670642555, "flos": 545779150848.0, "grad_norm": 0.07524916084480812, "language_loss": 0.92056942, "learning_rate": 0.000998716948986726, "loss": 0.93183053, "num_input_tokens_seen": 21612224, "router_z_loss_mlp": 0.34350586, "step": 271, "time_per_iteration": 2.7993569374084473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142479, "balance_loss_mlp": 1.10948217, "epoch": 0.05232781839168911, "flos": 603561881088.0, "grad_norm": 0.0817059207133684, "language_loss": 0.94050443, "learning_rate": 0.0009986945477961633, "loss": 0.95192927, "num_input_tokens_seen": 21681024, "router_z_loss_mlp": 0.33032227, "step": 272, "time_per_iteration": 2.692488193511963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162369, "balance_loss_mlp": 1.13108802, "epoch": 0.052520200076952676, "flos": 538504962048.0, "grad_norm": 0.07154102990319093, "language_loss": 0.9958387, "learning_rate": 0.0009986719529952066, "loss": 1.00746238, "num_input_tokens_seen": 21761616, "router_z_loss_mlp": 0.3125, "step": 273, "time_per_iteration": 2.834634780883789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151893, "balance_loss_mlp": 1.12099373, "epoch": 0.052712581762216236, "flos": 463384737792.0, "grad_norm": 0.11641144040169231, "language_loss": 0.98596179, "learning_rate": 0.000998649164592628, "loss": 0.99748075, "num_input_tokens_seen": 21828416, "router_z_loss_mlp": 0.30859375, "step": 274, "time_per_iteration": 2.570082902908325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128229, "balance_loss_mlp": 1.0986656, "epoch": 0.0529049634474798, "flos": 548020214784.0, "grad_norm": 0.08444223005841496, "language_loss": 0.96863008, "learning_rate": 0.0009986261825972748, "loss": 0.97991234, "num_input_tokens_seen": 21901600, "router_z_loss_mlp": 0.29541016, "step": 275, "time_per_iteration": 2.66398549079895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116765, "balance_loss_mlp": 1.08734369, "epoch": 0.05309734513274336, "flos": 618021052416.0, "grad_norm": 0.09541227165854013, "language_loss": 0.9859423, "learning_rate": 0.000998603007018069, "loss": 0.99711001, "num_input_tokens_seen": 21979312, "router_z_loss_mlp": 0.29394531, "step": 276, "time_per_iteration": 2.7675342559814453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108591, "balance_loss_mlp": 1.07731009, "epoch": 0.05328972681800693, "flos": 605498996736.0, "grad_norm": 0.06559506468622318, "language_loss": 0.95903766, "learning_rate": 0.0009985796378640089, "loss": 0.97012359, "num_input_tokens_seen": 22053776, "router_z_loss_mlp": 0.3125, "step": 277, "time_per_iteration": 2.7019519805908203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111687, "balance_loss_mlp": 1.08012068, "epoch": 0.05348210850327049, "flos": 604503088128.0, "grad_norm": 0.07318038514420845, "language_loss": 0.95983016, "learning_rate": 0.0009985560751441665, "loss": 0.97094703, "num_input_tokens_seen": 22134304, "router_z_loss_mlp": 0.31542969, "step": 278, "time_per_iteration": 2.8234922885894775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111914, "balance_loss_mlp": 1.0874306, "epoch": 0.053674490188534055, "flos": 630782816256.0, "grad_norm": 0.07220087085065136, "language_loss": 0.98319995, "learning_rate": 0.00099853231886769, "loss": 0.99439132, "num_input_tokens_seen": 22212896, "router_z_loss_mlp": 0.31713867, "step": 279, "time_per_iteration": 2.7748613357543945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133292, "balance_loss_mlp": 1.10162961, "epoch": 0.053866871873797614, "flos": 479185611264.0, "grad_norm": 0.06439402113592181, "language_loss": 0.98657203, "learning_rate": 0.0009985083690438024, "loss": 0.99790496, "num_input_tokens_seen": 22287216, "router_z_loss_mlp": 0.31640625, "step": 280, "time_per_iteration": 2.700810670852661 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132869, "balance_loss_mlp": 1.10204113, "epoch": 0.054059253559061174, "flos": 788035723776.0, "grad_norm": 0.04843472954862069, "language_loss": 0.89283121, "learning_rate": 0.0009984842256818016, "loss": 0.9041599, "num_input_tokens_seen": 22370864, "router_z_loss_mlp": 0.30786133, "step": 281, "time_per_iteration": 3.115292549133301 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113546, "balance_loss_mlp": 1.10580087, "epoch": 0.05425163524432474, "flos": 628361515008.0, "grad_norm": 0.06657413960403659, "language_loss": 0.99515754, "learning_rate": 0.0009984598887910613, "loss": 1.00651217, "num_input_tokens_seen": 22440080, "router_z_loss_mlp": 0.29614258, "step": 282, "time_per_iteration": 2.735640048980713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140553, "balance_loss_mlp": 1.10893846, "epoch": 0.0544440169295883, "flos": 615760164864.0, "grad_norm": 0.07881571737542031, "language_loss": 0.95306879, "learning_rate": 0.0009984353583810297, "loss": 0.96447432, "num_input_tokens_seen": 22517936, "router_z_loss_mlp": 0.31616211, "step": 283, "time_per_iteration": 2.8240931034088135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128602, "balance_loss_mlp": 1.09834647, "epoch": 0.05463639861485187, "flos": 647762406912.0, "grad_norm": 0.0943213260733239, "language_loss": 0.97471213, "learning_rate": 0.0009984106344612302, "loss": 0.98599815, "num_input_tokens_seen": 22590480, "router_z_loss_mlp": 0.30224609, "step": 284, "time_per_iteration": 2.802689790725708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119254, "balance_loss_mlp": 1.08964229, "epoch": 0.054828780300115426, "flos": 797192699904.0, "grad_norm": 0.0726777825280204, "language_loss": 0.92919928, "learning_rate": 0.0009983857170412615, "loss": 0.94039178, "num_input_tokens_seen": 22668144, "router_z_loss_mlp": 0.29589844, "step": 285, "time_per_iteration": 3.0111782550811768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134029, "balance_loss_mlp": 1.10165143, "epoch": 0.05502116198537899, "flos": 549690458112.0, "grad_norm": 0.06957121076923053, "language_loss": 0.92976809, "learning_rate": 0.000998360606130798, "loss": 0.94110835, "num_input_tokens_seen": 22749648, "router_z_loss_mlp": 0.32324219, "step": 286, "time_per_iteration": 2.8221306800842285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01949249, "balance_loss_mlp": 1.90461755, "epoch": 0.05521354367064255, "flos": 1407753437184.0, "grad_norm": 0.20138197735421756, "language_loss": 0.69073117, "learning_rate": 0.0009983353017395877, "loss": 0.71022367, "num_input_tokens_seen": 22982752, "router_z_loss_mlp": 0.44726562, "step": 287, "time_per_iteration": 4.872509956359863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160615, "balance_loss_mlp": 1.12447047, "epoch": 0.05540592535590612, "flos": 645420026880.0, "grad_norm": 0.09083797153449202, "language_loss": 0.98382282, "learning_rate": 0.0009983098038774552, "loss": 0.99542892, "num_input_tokens_seen": 23053584, "router_z_loss_mlp": 0.36132812, "step": 288, "time_per_iteration": 2.7861900329589844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0156365, "balance_loss_mlp": 1.54524422, "epoch": 0.05559830704116968, "flos": 1511095647744.0, "grad_norm": 0.05039988105800305, "language_loss": 0.78170228, "learning_rate": 0.0009982841125542993, "loss": 0.79733872, "num_input_tokens_seen": 23280256, "router_z_loss_mlp": 0.18359375, "step": 289, "time_per_iteration": 4.809176683425903 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183294, "balance_loss_mlp": 1.14958155, "epoch": 0.055790688726433245, "flos": 508328980992.0, "grad_norm": 0.11767359006900376, "language_loss": 0.95852768, "learning_rate": 0.0009982582277800948, "loss": 0.9703607, "num_input_tokens_seen": 23345760, "router_z_loss_mlp": 0.33666992, "step": 290, "time_per_iteration": 2.5785539150238037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114864, "balance_loss_mlp": 1.11738336, "epoch": 0.055983070411696804, "flos": 657870501888.0, "grad_norm": 0.09005932528563108, "language_loss": 1.03039932, "learning_rate": 0.0009982321495648908, "loss": 1.04188573, "num_input_tokens_seen": 23420720, "router_z_loss_mlp": 0.3125, "step": 291, "time_per_iteration": 2.798412561416626 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133747, "balance_loss_mlp": 1.10218096, "epoch": 0.05617545209696037, "flos": 587335919616.0, "grad_norm": 0.07041326246084649, "language_loss": 0.9488259, "learning_rate": 0.0009982058779188115, "loss": 0.96016335, "num_input_tokens_seen": 23492576, "router_z_loss_mlp": 0.31542969, "step": 292, "time_per_iteration": 2.7117443084716797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113109, "balance_loss_mlp": 1.08354521, "epoch": 0.05636783378222393, "flos": 611621632512.0, "grad_norm": 0.0659469171672323, "language_loss": 1.02221513, "learning_rate": 0.0009981794128520567, "loss": 1.0333463, "num_input_tokens_seen": 23569824, "router_z_loss_mlp": 0.29589844, "step": 293, "time_per_iteration": 2.83561372756958 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113063, "balance_loss_mlp": 1.10104227, "epoch": 0.0565602154674875, "flos": 668161405440.0, "grad_norm": 0.07618014203826041, "language_loss": 0.98908657, "learning_rate": 0.000998152754374901, "loss": 1.00039291, "num_input_tokens_seen": 23649984, "router_z_loss_mlp": 0.2956543, "step": 294, "time_per_iteration": 2.879502773284912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133819, "balance_loss_mlp": 1.1052562, "epoch": 0.05675259715275106, "flos": 617242830336.0, "grad_norm": 0.09109925372268521, "language_loss": 0.94850433, "learning_rate": 0.0009981259024976943, "loss": 0.95984244, "num_input_tokens_seen": 23722032, "router_z_loss_mlp": 0.28564453, "step": 295, "time_per_iteration": 2.708038568496704 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129466, "balance_loss_mlp": 1.10023606, "epoch": 0.05694497883801462, "flos": 751769040384.0, "grad_norm": 0.08548016831625774, "language_loss": 0.92669952, "learning_rate": 0.0009980988572308612, "loss": 0.93799424, "num_input_tokens_seen": 23797376, "router_z_loss_mlp": 0.29248047, "step": 296, "time_per_iteration": 2.99466609954834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126952, "balance_loss_mlp": 1.09779358, "epoch": 0.05713736052327818, "flos": 712010995200.0, "grad_norm": 0.05751010220277151, "language_loss": 0.96034563, "learning_rate": 0.0009980716185849015, "loss": 0.9716152, "num_input_tokens_seen": 23880496, "router_z_loss_mlp": 0.29174805, "step": 297, "time_per_iteration": 3.0216734409332275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135652, "balance_loss_mlp": 1.10651755, "epoch": 0.05732974220854175, "flos": 468976200192.0, "grad_norm": 0.06310788330802251, "language_loss": 0.92855394, "learning_rate": 0.0009980441865703904, "loss": 0.93991041, "num_input_tokens_seen": 23950016, "router_z_loss_mlp": 0.29150391, "step": 298, "time_per_iteration": 2.6354267597198486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124221, "balance_loss_mlp": 1.09456158, "epoch": 0.05752212389380531, "flos": 601422133248.0, "grad_norm": 0.07879622532675779, "language_loss": 1.0091691, "learning_rate": 0.000998016561197978, "loss": 1.02041125, "num_input_tokens_seen": 24020064, "router_z_loss_mlp": 0.29638672, "step": 299, "time_per_iteration": 2.726853370666504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104202, "balance_loss_mlp": 1.0768075, "epoch": 0.057714505579068875, "flos": 678664852992.0, "grad_norm": 0.07606317837722033, "language_loss": 0.9243238, "learning_rate": 0.0009979887424783895, "loss": 0.9353658, "num_input_tokens_seen": 24095360, "router_z_loss_mlp": 0.27441406, "step": 300, "time_per_iteration": 2.866880416870117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.03286275, "balance_loss_mlp": 5.97428513, "diversity_loss_mlp": 0.40086228, "epoch": 0.057906887264332435, "flos": 595884999168.0, "grad_norm": 0.08630620995418306, "language_loss": 1.00780904, "learning_rate": 0.0009979607304224248, "loss": 1.04067183, "num_input_tokens_seen": 24164608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.09870158, "step": 301, "time_per_iteration": 2.8737847805023193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101287, "balance_loss_mlp": 1.07100797, "diversity_loss_mlp": 0.0, "epoch": 0.058099268949596, "flos": 552116901888.0, "grad_norm": 0.07465341521099292, "language_loss": 0.98771101, "learning_rate": 0.000997932525040959, "loss": 0.99872386, "num_input_tokens_seen": 24233840, "router_z_loss_mlp": 0.30273438, "routerloss_mlp": 0.0, "step": 302, "time_per_iteration": 2.646038055419922 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097684, "balance_loss_mlp": 1.06912112, "diversity_loss_mlp": 0.0, "epoch": 0.05829165063485956, "flos": 508170765312.0, "grad_norm": 0.0784548088046029, "language_loss": 1.01345074, "learning_rate": 0.000997904126344943, "loss": 1.02442753, "num_input_tokens_seen": 24302928, "router_z_loss_mlp": 0.28527832, "routerloss_mlp": 0.0, "step": 303, "time_per_iteration": 2.607773542404175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117476, "balance_loss_mlp": 1.08612442, "diversity_loss_mlp": 0.0, "epoch": 0.05848403232012313, "flos": 615231562752.0, "grad_norm": 0.08413175271133923, "language_loss": 0.96722186, "learning_rate": 0.0009978755343454018, "loss": 0.97839665, "num_input_tokens_seen": 24377024, "router_z_loss_mlp": 0.31323242, "routerloss_mlp": 0.0, "step": 304, "time_per_iteration": 2.7423698902130127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146235, "balance_loss_mlp": 1.11099684, "diversity_loss_mlp": 0.0, "epoch": 0.05867641400538669, "flos": 500083849728.0, "grad_norm": 0.08591892096672729, "language_loss": 0.97475642, "learning_rate": 0.0009978467490534355, "loss": 0.98621881, "num_input_tokens_seen": 24442736, "router_z_loss_mlp": 0.35229492, "routerloss_mlp": 0.0, "step": 305, "time_per_iteration": 2.5751075744628906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144387, "balance_loss_mlp": 1.10974526, "diversity_loss_mlp": 0.0, "epoch": 0.05886879569065025, "flos": 531290244096.0, "grad_norm": 0.06674928608125212, "language_loss": 0.95161211, "learning_rate": 0.00099781777048022, "loss": 0.96305597, "num_input_tokens_seen": 24514800, "router_z_loss_mlp": 0.34667969, "routerloss_mlp": 0.0, "step": 306, "time_per_iteration": 2.697453260421753 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142445, "balance_loss_mlp": 1.10766006, "diversity_loss_mlp": 0.0, "epoch": 0.05906117737591381, "flos": 489056569344.0, "grad_norm": 0.08714127978238019, "language_loss": 0.96547389, "learning_rate": 0.0009977885986370057, "loss": 0.97689843, "num_input_tokens_seen": 24581648, "router_z_loss_mlp": 0.34790039, "routerloss_mlp": 0.0, "step": 307, "time_per_iteration": 2.555311679840088 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114348, "balance_loss_mlp": 1.11098385, "diversity_loss_mlp": 0.0, "epoch": 0.05925355906117737, "flos": 591511527936.0, "grad_norm": 0.07630797692789458, "language_loss": 0.93133295, "learning_rate": 0.000997759233535118, "loss": 0.94276774, "num_input_tokens_seen": 24658864, "router_z_loss_mlp": 0.32495117, "routerloss_mlp": 0.0, "step": 308, "time_per_iteration": 2.7760326862335205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137991, "balance_loss_mlp": 1.10530353, "diversity_loss_mlp": 0.0, "epoch": 0.05944594074644094, "flos": 563655532032.0, "grad_norm": 0.1535726459245726, "language_loss": 0.98530197, "learning_rate": 0.0009977296751859576, "loss": 0.99668187, "num_input_tokens_seen": 24735808, "router_z_loss_mlp": 0.32666016, "routerloss_mlp": 0.0, "step": 309, "time_per_iteration": 2.7718236446380615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119868, "balance_loss_mlp": 1.09030402, "diversity_loss_mlp": 0.0, "epoch": 0.0596383224317045, "flos": 538747241472.0, "grad_norm": 0.09363029892750833, "language_loss": 1.00139546, "learning_rate": 0.0009976999236009998, "loss": 1.01259422, "num_input_tokens_seen": 24807744, "router_z_loss_mlp": 0.2956543, "routerloss_mlp": 0.0, "step": 310, "time_per_iteration": 2.7480924129486084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128418, "balance_loss_mlp": 1.1004039, "diversity_loss_mlp": 0.0, "epoch": 0.059830704116968066, "flos": 560957446656.0, "grad_norm": 0.11799476734746514, "language_loss": 1.01830125, "learning_rate": 0.0009976699787917955, "loss": 1.02958548, "num_input_tokens_seen": 24876640, "router_z_loss_mlp": 0.28051758, "routerloss_mlp": 0.0, "step": 311, "time_per_iteration": 2.6702628135681152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02237821, "balance_loss_mlp": 2.22513723, "diversity_loss_mlp": 0.0, "epoch": 0.060023085802231625, "flos": 1570615059456.0, "grad_norm": 0.1521885653041848, "language_loss": 0.73442996, "learning_rate": 0.00099763984076997, "loss": 0.75680816, "num_input_tokens_seen": 25110864, "router_z_loss_mlp": 0.12695312, "routerloss_mlp": 0.0, "step": 312, "time_per_iteration": 4.968472480773926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01934551, "balance_loss_mlp": 3.38140035, "diversity_loss_mlp": 0.39575127, "epoch": 0.06021546748749519, "flos": 482657149440.0, "grad_norm": 0.05936914788699087, "language_loss": 0.983639, "learning_rate": 0.0009976095095472243, "loss": 1.00298452, "num_input_tokens_seen": 25179328, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.04597524, "step": 313, "time_per_iteration": 2.6077775955200195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140935, "balance_loss_mlp": 1.11120427, "diversity_loss_mlp": 0.0, "epoch": 0.06040784917275875, "flos": 620195304960.0, "grad_norm": 0.09323488343042824, "language_loss": 0.95392269, "learning_rate": 0.0009975789851353334, "loss": 0.96533203, "num_input_tokens_seen": 25254128, "router_z_loss_mlp": 0.29736328, "routerloss_mlp": 0.0, "step": 314, "time_per_iteration": 2.810530424118042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152798, "balance_loss_mlp": 1.12359178, "diversity_loss_mlp": 0.0, "epoch": 0.06060023085802232, "flos": 483553939968.0, "grad_norm": 0.09115128879339694, "language_loss": 0.97407585, "learning_rate": 0.0009975482675461487, "loss": 0.98560387, "num_input_tokens_seen": 25324624, "router_z_loss_mlp": 0.29223633, "routerloss_mlp": 0.0, "step": 315, "time_per_iteration": 2.658961772918701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165231, "balance_loss_mlp": 1.13464189, "diversity_loss_mlp": 0.0, "epoch": 0.06079261254328588, "flos": 581892761088.0, "grad_norm": 0.08232329918432242, "language_loss": 0.95008749, "learning_rate": 0.0009975173567915952, "loss": 0.96173978, "num_input_tokens_seen": 25393648, "router_z_loss_mlp": 0.3059082, "routerloss_mlp": 0.0, "step": 316, "time_per_iteration": 2.7026963233947754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01208938, "balance_loss_mlp": 1.17508304, "diversity_loss_mlp": 0.0, "epoch": 0.060984994228549444, "flos": 687794664960.0, "grad_norm": 0.11734128354988786, "language_loss": 0.89037865, "learning_rate": 0.000997486252883674, "loss": 0.90246803, "num_input_tokens_seen": 25469152, "router_z_loss_mlp": 0.33886719, "routerloss_mlp": 0.0, "step": 317, "time_per_iteration": 2.82440447807312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01246386, "balance_loss_mlp": 1.21069503, "diversity_loss_mlp": 0.0, "epoch": 0.061177375913813004, "flos": 1314775577088.0, "grad_norm": 0.09191065951965113, "language_loss": 0.94435382, "learning_rate": 0.0009974549558344602, "loss": 0.95681769, "num_input_tokens_seen": 25560944, "router_z_loss_mlp": 0.35693359, "routerloss_mlp": 0.0, "step": 318, "time_per_iteration": 3.6594014167785645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01256455, "balance_loss_mlp": 1.22028661, "diversity_loss_mlp": 0.0, "epoch": 0.06136975759907657, "flos": 574337018880.0, "grad_norm": 0.10186826507715854, "language_loss": 1.03254342, "learning_rate": 0.000997423465656105, "loss": 1.04510808, "num_input_tokens_seen": 25631424, "router_z_loss_mlp": 0.36181641, "routerloss_mlp": 0.0, "step": 319, "time_per_iteration": 2.7277376651763916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01228783, "balance_loss_mlp": 1.19342566, "diversity_loss_mlp": 0.0, "epoch": 0.06156213928434013, "flos": 527537152512.0, "grad_norm": 0.07892523617459922, "language_loss": 1.00628281, "learning_rate": 0.0009973917823608335, "loss": 1.01857066, "num_input_tokens_seen": 25698176, "router_z_loss_mlp": 0.35375977, "routerloss_mlp": 0.0, "step": 320, "time_per_iteration": 2.608973503112793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01216411, "balance_loss_mlp": 1.18279386, "diversity_loss_mlp": 0.0, "epoch": 0.061754520969603696, "flos": 495507746304.0, "grad_norm": 0.08046246772740448, "language_loss": 0.96186835, "learning_rate": 0.0009973599059609462, "loss": 0.9740324, "num_input_tokens_seen": 25773472, "router_z_loss_mlp": 0.33618164, "routerloss_mlp": 0.0, "step": 321, "time_per_iteration": 2.736543655395508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188588, "balance_loss_mlp": 1.15735531, "diversity_loss_mlp": 0.0, "epoch": 0.061946902654867256, "flos": 440079879168.0, "grad_norm": 0.06958940991484033, "language_loss": 0.93877137, "learning_rate": 0.000997327836468819, "loss": 0.95065725, "num_input_tokens_seen": 25841088, "router_z_loss_mlp": 0.31225586, "routerloss_mlp": 0.0, "step": 322, "time_per_iteration": 2.6034624576568604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172579, "balance_loss_mlp": 1.14392066, "diversity_loss_mlp": 0.0, "epoch": 0.06213928434013082, "flos": 598800397824.0, "grad_norm": 0.10097410409674823, "language_loss": 0.96476239, "learning_rate": 0.000997295573896902, "loss": 0.97648811, "num_input_tokens_seen": 25919424, "router_z_loss_mlp": 0.28649902, "routerloss_mlp": 0.0, "step": 323, "time_per_iteration": 2.8207039833068848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02388506, "balance_loss_mlp": 2.37343788, "diversity_loss_mlp": 0.0, "epoch": 0.06233166602539438, "flos": 1450135789056.0, "grad_norm": 0.2858946964689234, "language_loss": 0.8119604, "learning_rate": 0.000997263118257721, "loss": 0.83584547, "num_input_tokens_seen": 26135504, "router_z_loss_mlp": 0.15039062, "routerloss_mlp": 0.0, "step": 324, "time_per_iteration": 4.691263437271118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01793915, "balance_loss_mlp": 1.78142214, "diversity_loss_mlp": 0.0, "epoch": 0.06252404771065795, "flos": 1463327036928.0, "grad_norm": 0.11944332826526777, "language_loss": 0.78571939, "learning_rate": 0.0009972304695638763, "loss": 0.80365855, "num_input_tokens_seen": 26358880, "router_z_loss_mlp": 0.125, "routerloss_mlp": 0.0, "step": 325, "time_per_iteration": 4.837715148925781 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01214832, "balance_loss_mlp": 1.18657923, "diversity_loss_mlp": 0.0, "epoch": 0.06271642939592151, "flos": 464294011392.0, "grad_norm": 0.0814388529334085, "language_loss": 0.91516924, "learning_rate": 0.000997197627828043, "loss": 0.92731762, "num_input_tokens_seen": 26425888, "router_z_loss_mlp": 0.2824707, "routerloss_mlp": 0.0, "step": 326, "time_per_iteration": 2.5261096954345703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01228602, "balance_loss_mlp": 1.20018268, "diversity_loss_mlp": 0.0, "epoch": 0.06290881108118507, "flos": 532374985728.0, "grad_norm": 0.08774897428196327, "language_loss": 0.86495018, "learning_rate": 0.0009971645930629716, "loss": 0.87723619, "num_input_tokens_seen": 26500656, "router_z_loss_mlp": 0.28442383, "routerloss_mlp": 0.0, "step": 327, "time_per_iteration": 2.73193621635437 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01236303, "balance_loss_mlp": 1.20914674, "diversity_loss_mlp": 0.0, "epoch": 0.06310119276644863, "flos": 673562718720.0, "grad_norm": 0.0823367638378532, "language_loss": 0.99889791, "learning_rate": 0.0009971313652814872, "loss": 1.01126099, "num_input_tokens_seen": 26577408, "router_z_loss_mlp": 0.2722168, "routerloss_mlp": 0.0, "step": 328, "time_per_iteration": 2.79278826713562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01224995, "balance_loss_mlp": 1.1973865, "diversity_loss_mlp": 0.0, "epoch": 0.0632935744517122, "flos": 770732734464.0, "grad_norm": 0.1407341288256049, "language_loss": 0.97435188, "learning_rate": 0.0009970979444964903, "loss": 0.98660183, "num_input_tokens_seen": 26652048, "router_z_loss_mlp": 0.27636719, "routerloss_mlp": 0.0, "step": 329, "time_per_iteration": 2.9955334663391113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213807, "balance_loss_mlp": 1.18553066, "diversity_loss_mlp": 0.0, "epoch": 0.06348595613697576, "flos": 561913708032.0, "grad_norm": 0.10291010686297611, "language_loss": 0.9869082, "learning_rate": 0.0009970643307209556, "loss": 0.99904621, "num_input_tokens_seen": 26728192, "router_z_loss_mlp": 0.28295898, "routerloss_mlp": 0.0, "step": 330, "time_per_iteration": 2.79775071144104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01202809, "balance_loss_mlp": 1.17248201, "diversity_loss_mlp": 0.0, "epoch": 0.06367833782223932, "flos": 676189223424.0, "grad_norm": 0.08231148280507655, "language_loss": 0.94842714, "learning_rate": 0.0009970305239679334, "loss": 0.96045524, "num_input_tokens_seen": 26798016, "router_z_loss_mlp": 0.30322266, "routerloss_mlp": 0.0, "step": 331, "time_per_iteration": 2.802400827407837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01203401, "balance_loss_mlp": 1.17300248, "diversity_loss_mlp": 0.0, "epoch": 0.06387071950750288, "flos": 495297773568.0, "grad_norm": 0.08804880344809486, "language_loss": 0.99692816, "learning_rate": 0.0009969965242505483, "loss": 1.00896215, "num_input_tokens_seen": 26867536, "router_z_loss_mlp": 0.30371094, "routerloss_mlp": 0.0, "step": 332, "time_per_iteration": 2.634702682495117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01224958, "balance_loss_mlp": 1.19243741, "diversity_loss_mlp": 0.0, "epoch": 0.06406310119276645, "flos": 533447244288.0, "grad_norm": 0.06414677867033303, "language_loss": 0.95931363, "learning_rate": 0.0009969623315820007, "loss": 0.97156322, "num_input_tokens_seen": 26941216, "router_z_loss_mlp": 0.32470703, "routerloss_mlp": 0.0, "step": 333, "time_per_iteration": 2.6661436557769775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01245141, "balance_loss_mlp": 1.21149969, "diversity_loss_mlp": 0.0, "epoch": 0.06425548287803001, "flos": 456184700928.0, "grad_norm": 0.06624608002660057, "language_loss": 0.9590115, "learning_rate": 0.000996927945975565, "loss": 0.97146285, "num_input_tokens_seen": 27006560, "router_z_loss_mlp": 0.33618164, "routerloss_mlp": 0.0, "step": 334, "time_per_iteration": 2.576922655105591 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01252992, "balance_loss_mlp": 1.21672821, "diversity_loss_mlp": 0.0, "epoch": 0.06444786456329357, "flos": 560077908480.0, "grad_norm": 0.07108304231036514, "language_loss": 0.93002915, "learning_rate": 0.0009968933674445906, "loss": 0.94255906, "num_input_tokens_seen": 27076400, "router_z_loss_mlp": 0.36230469, "routerloss_mlp": 0.0, "step": 335, "time_per_iteration": 2.706836462020874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01267675, "balance_loss_mlp": 1.23026776, "diversity_loss_mlp": 0.0, "epoch": 0.06464024624855713, "flos": 666085897728.0, "grad_norm": 0.0701420022906001, "language_loss": 0.95153642, "learning_rate": 0.0009968585960025028, "loss": 0.96421325, "num_input_tokens_seen": 27158672, "router_z_loss_mlp": 0.37402344, "routerloss_mlp": 0.0, "step": 336, "time_per_iteration": 2.9356396198272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01838771, "balance_loss_mlp": 1.81416643, "diversity_loss_mlp": 0.0, "epoch": 0.0648326279338207, "flos": 1521371870208.0, "grad_norm": 0.09587986506557475, "language_loss": 0.77653188, "learning_rate": 0.0009968236316628006, "loss": 0.79491967, "num_input_tokens_seen": 27380592, "router_z_loss_mlp": 0.24511719, "routerloss_mlp": 0.0, "step": 337, "time_per_iteration": 4.784119606018066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01242978, "balance_loss_mlp": 1.20874155, "diversity_loss_mlp": 0.0, "epoch": 0.06502500961908426, "flos": 1143339909120.0, "grad_norm": 0.1007121907193806, "language_loss": 0.9314844, "learning_rate": 0.0009967884744390583, "loss": 0.94391423, "num_input_tokens_seen": 27469984, "router_z_loss_mlp": 0.3425293, "routerloss_mlp": 0.0, "step": 338, "time_per_iteration": 3.5315823554992676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01209945, "balance_loss_mlp": 1.1758039, "diversity_loss_mlp": 0.0, "epoch": 0.06521739130434782, "flos": 582609314304.0, "grad_norm": 0.10820011352875603, "language_loss": 0.93812096, "learning_rate": 0.0009967531243449256, "loss": 0.95022047, "num_input_tokens_seen": 27543904, "router_z_loss_mlp": 0.34130859, "routerloss_mlp": 0.0, "step": 339, "time_per_iteration": 2.6663827896118164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172072, "balance_loss_mlp": 1.13959908, "diversity_loss_mlp": 0.0, "epoch": 0.06540977298961138, "flos": 497650065408.0, "grad_norm": 0.07246387309668721, "language_loss": 1.014539, "learning_rate": 0.000996717581394126, "loss": 1.02625966, "num_input_tokens_seen": 27609888, "router_z_loss_mlp": 0.32470703, "routerloss_mlp": 0.0, "step": 340, "time_per_iteration": 2.5849766731262207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142038, "balance_loss_mlp": 1.11142516, "diversity_loss_mlp": 0.0, "epoch": 0.06560215467487496, "flos": 542871092736.0, "grad_norm": 0.07622939946709405, "language_loss": 1.01788783, "learning_rate": 0.000996681845600459, "loss": 1.0293082, "num_input_tokens_seen": 27683936, "router_z_loss_mlp": 0.30615234, "routerloss_mlp": 0.0, "step": 341, "time_per_iteration": 2.6651370525360107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138836, "balance_loss_mlp": 1.10901034, "diversity_loss_mlp": 0.0, "epoch": 0.06579453636013852, "flos": 413454357504.0, "grad_norm": 0.06359259902727714, "language_loss": 0.94080132, "learning_rate": 0.0009966459169777982, "loss": 0.95218974, "num_input_tokens_seen": 27747840, "router_z_loss_mlp": 0.29785156, "routerloss_mlp": 0.0, "step": 342, "time_per_iteration": 2.524775981903076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136152, "balance_loss_mlp": 1.10670757, "diversity_loss_mlp": 0.0, "epoch": 0.06598691804540208, "flos": 560618993664.0, "grad_norm": 0.07912610309003802, "language_loss": 1.03090763, "learning_rate": 0.0009966097955400924, "loss": 1.04226899, "num_input_tokens_seen": 27819728, "router_z_loss_mlp": 0.29418945, "routerloss_mlp": 0.0, "step": 343, "time_per_iteration": 2.662269115447998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074802, "balance_loss_mlp": 1.74366593, "diversity_loss_mlp": 0.35364389, "epoch": 0.06617929973066564, "flos": 572090812416.0, "grad_norm": 0.10968898462568231, "language_loss": 0.99445379, "learning_rate": 0.0009965734813013652, "loss": 1.00520182, "num_input_tokens_seen": 27893536, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02614743, "step": 344, "time_per_iteration": 2.82026743888855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138748, "balance_loss_mlp": 1.10989952, "diversity_loss_mlp": 0.0, "epoch": 0.06637168141592921, "flos": 490479763968.0, "grad_norm": 0.13046244738635646, "language_loss": 0.99630761, "learning_rate": 0.0009965369742757151, "loss": 1.00769508, "num_input_tokens_seen": 27960976, "router_z_loss_mlp": 0.28833008, "routerloss_mlp": 0.0, "step": 345, "time_per_iteration": 2.565809965133667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112942, "balance_loss_mlp": 1.10131097, "diversity_loss_mlp": 0.0, "epoch": 0.06656406310119277, "flos": 1079194834944.0, "grad_norm": 0.1120170016707216, "language_loss": 0.96858162, "learning_rate": 0.0009965002744773152, "loss": 0.9798758, "num_input_tokens_seen": 28050864, "router_z_loss_mlp": 0.28125, "routerloss_mlp": 0.0, "step": 346, "time_per_iteration": 3.52542781829834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144914, "balance_loss_mlp": 1.1170671, "diversity_loss_mlp": 0.0, "epoch": 0.06675644478645633, "flos": 513680735232.0, "grad_norm": 0.08447825810050776, "language_loss": 0.93369007, "learning_rate": 0.0009964633819204139, "loss": 0.94513917, "num_input_tokens_seen": 28122448, "router_z_loss_mlp": 0.27832031, "routerloss_mlp": 0.0, "step": 347, "time_per_iteration": 2.6504640579223633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02729187, "balance_loss_mlp": 2.68856025, "diversity_loss_mlp": 0.0, "epoch": 0.06694882647171989, "flos": 1447192479744.0, "grad_norm": 0.36365581545094156, "language_loss": 0.81801116, "learning_rate": 0.0009964262966193338, "loss": 0.84530306, "num_input_tokens_seen": 28350352, "router_z_loss_mlp": 0.40625, "routerloss_mlp": 0.0, "step": 348, "time_per_iteration": 4.9217259883880615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01886969, "balance_loss_mlp": 1.8606472, "diversity_loss_mlp": 0.0, "epoch": 0.06714120815698346, "flos": 1552061772288.0, "grad_norm": 0.11180228987157655, "language_loss": 0.75153887, "learning_rate": 0.000996389018588473, "loss": 0.77040851, "num_input_tokens_seen": 28585584, "router_z_loss_mlp": 0.26367188, "routerloss_mlp": 0.0, "step": 349, "time_per_iteration": 4.915479898452759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148873, "balance_loss_mlp": 1.11942816, "diversity_loss_mlp": 0.0, "epoch": 0.06733358984224702, "flos": 880073869824.0, "grad_norm": 0.08620115988858058, "language_loss": 0.93105251, "learning_rate": 0.000996351547842304, "loss": 0.94254124, "num_input_tokens_seen": 28672512, "router_z_loss_mlp": 0.29443359, "routerloss_mlp": 0.0, "step": 350, "time_per_iteration": 3.2273383140563965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183797, "balance_loss_mlp": 1.152946, "diversity_loss_mlp": 0.0, "epoch": 0.06752597152751058, "flos": 518906580480.0, "grad_norm": 0.10656846418921655, "language_loss": 0.91589314, "learning_rate": 0.0009963138843953744, "loss": 0.92773116, "num_input_tokens_seen": 28741520, "router_z_loss_mlp": 0.30810547, "routerloss_mlp": 0.0, "step": 351, "time_per_iteration": 2.6443302631378174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0122224, "balance_loss_mlp": 1.19079256, "diversity_loss_mlp": 0.0, "epoch": 0.06771835321277414, "flos": 539668624896.0, "grad_norm": 0.12218392571909323, "language_loss": 0.95582229, "learning_rate": 0.000996276028262306, "loss": 0.9680447, "num_input_tokens_seen": 28814912, "router_z_loss_mlp": 0.31420898, "routerloss_mlp": 0.0, "step": 352, "time_per_iteration": 2.819287061691284 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0121763, "balance_loss_mlp": 1.18711233, "diversity_loss_mlp": 0.0, "epoch": 0.0679107348980377, "flos": 460666828800.0, "grad_norm": 0.14903684788896404, "language_loss": 1.01496267, "learning_rate": 0.0009962379794577964, "loss": 1.02713895, "num_input_tokens_seen": 28882192, "router_z_loss_mlp": 0.30493164, "routerloss_mlp": 0.0, "step": 353, "time_per_iteration": 2.591759204864502 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0123139, "balance_loss_mlp": 1.2003479, "diversity_loss_mlp": 0.0, "epoch": 0.06810311658330127, "flos": 635922026496.0, "grad_norm": 0.0632056956592815, "language_loss": 0.9195236, "learning_rate": 0.000996199737996617, "loss": 0.9318375, "num_input_tokens_seen": 28968576, "router_z_loss_mlp": 0.31005859, "routerloss_mlp": 0.0, "step": 354, "time_per_iteration": 2.889040231704712 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01209696, "balance_loss_mlp": 1.17963195, "diversity_loss_mlp": 0.0, "epoch": 0.06829549826856483, "flos": 464679452160.0, "grad_norm": 0.07119928644727336, "language_loss": 1.00405252, "learning_rate": 0.0009961613038936149, "loss": 1.0161494, "num_input_tokens_seen": 29036160, "router_z_loss_mlp": 0.30029297, "routerloss_mlp": 0.0, "step": 355, "time_per_iteration": 2.5856525897979736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187257, "balance_loss_mlp": 1.15755057, "diversity_loss_mlp": 0.0, "epoch": 0.06848787995382839, "flos": 634647135744.0, "grad_norm": 0.07116362106359332, "language_loss": 0.93361115, "learning_rate": 0.000996122677163711, "loss": 0.9454838, "num_input_tokens_seen": 29112048, "router_z_loss_mlp": 0.296875, "routerloss_mlp": 0.0, "step": 356, "time_per_iteration": 2.8134818077087402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213028, "balance_loss_mlp": 1.18367887, "diversity_loss_mlp": 0.0, "epoch": 0.06868026163909195, "flos": 806374268928.0, "grad_norm": 0.08014414191517881, "language_loss": 0.98940754, "learning_rate": 0.000996083857821902, "loss": 1.0015378, "num_input_tokens_seen": 29190960, "router_z_loss_mlp": 0.29345703, "routerloss_mlp": 0.0, "step": 357, "time_per_iteration": 3.0531890392303467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01237281, "balance_loss_mlp": 1.20714498, "diversity_loss_mlp": 0.0, "epoch": 0.06887264332435553, "flos": 439227505152.0, "grad_norm": 0.06260381392843543, "language_loss": 0.96791607, "learning_rate": 0.0009960448458832588, "loss": 0.98028892, "num_input_tokens_seen": 29262832, "router_z_loss_mlp": 0.30126953, "routerloss_mlp": 0.0, "step": 358, "time_per_iteration": 2.696443557739258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01236116, "balance_loss_mlp": 1.20750594, "diversity_loss_mlp": 0.0, "epoch": 0.06906502500961909, "flos": 484767161856.0, "grad_norm": 0.07177130169486132, "language_loss": 0.96227086, "learning_rate": 0.000996005641362927, "loss": 0.97463197, "num_input_tokens_seen": 29329552, "router_z_loss_mlp": 0.28637695, "routerloss_mlp": 0.0, "step": 359, "time_per_iteration": 2.58060884475708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01229528, "balance_loss_mlp": 1.19984436, "diversity_loss_mlp": 0.0, "epoch": 0.06925740669488265, "flos": 733611105792.0, "grad_norm": 0.09877521418753983, "language_loss": 0.99257219, "learning_rate": 0.0009959662442761274, "loss": 1.00486755, "num_input_tokens_seen": 29410784, "router_z_loss_mlp": 0.29663086, "routerloss_mlp": 0.0, "step": 360, "time_per_iteration": 2.8970725536346436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01241998, "balance_loss_mlp": 1.21033561, "diversity_loss_mlp": 0.0, "epoch": 0.0694497883801462, "flos": 552415707648.0, "grad_norm": 0.07509157549903762, "language_loss": 0.93086261, "learning_rate": 0.000995926654638155, "loss": 0.9432826, "num_input_tokens_seen": 29486992, "router_z_loss_mlp": 0.31640625, "routerloss_mlp": 0.0, "step": 361, "time_per_iteration": 2.787796974182129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01225169, "balance_loss_mlp": 1.19405532, "diversity_loss_mlp": 0.0, "epoch": 0.06964217006540978, "flos": 678015111168.0, "grad_norm": 0.08313329413520473, "language_loss": 0.94580126, "learning_rate": 0.00099588687246438, "loss": 0.95805293, "num_input_tokens_seen": 29557232, "router_z_loss_mlp": 0.31103516, "routerloss_mlp": 0.0, "step": 362, "time_per_iteration": 2.826186418533325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188587, "balance_loss_mlp": 1.15785527, "diversity_loss_mlp": 0.0, "epoch": 0.06983455175067334, "flos": 524241082368.0, "grad_norm": 0.12654684897021498, "language_loss": 1.02203465, "learning_rate": 0.0009958468977702471, "loss": 1.03392053, "num_input_tokens_seen": 29625344, "router_z_loss_mlp": 0.30712891, "routerloss_mlp": 0.0, "step": 363, "time_per_iteration": 2.5915637016296387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02117372, "balance_loss_mlp": 1.97470212, "diversity_loss_mlp": 0.0, "epoch": 0.0700269334359369, "flos": 1576787254272.0, "grad_norm": 0.12517092959889778, "language_loss": 0.79734707, "learning_rate": 0.0009958067305712761, "loss": 0.81852078, "num_input_tokens_seen": 29843664, "router_z_loss_mlp": 1.4296875, "routerloss_mlp": 0.0, "step": 364, "time_per_iteration": 4.79950737953186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195198, "balance_loss_mlp": 1.16406059, "diversity_loss_mlp": 0.0, "epoch": 0.07021931512120046, "flos": 1013248839168.0, "grad_norm": 0.08484436116426784, "language_loss": 0.90580225, "learning_rate": 0.0009957663708830612, "loss": 0.91775423, "num_input_tokens_seen": 29927152, "router_z_loss_mlp": 0.31152344, "routerloss_mlp": 0.0, "step": 365, "time_per_iteration": 3.2616662979125977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0119947, "balance_loss_mlp": 1.16575801, "diversity_loss_mlp": 0.0, "epoch": 0.07041169680646403, "flos": 822983099904.0, "grad_norm": 0.10575932689534903, "language_loss": 0.93159938, "learning_rate": 0.0009957258187212714, "loss": 0.9435941, "num_input_tokens_seen": 30004928, "router_z_loss_mlp": 0.33740234, "routerloss_mlp": 0.0, "step": 366, "time_per_iteration": 3.0113134384155273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02012454, "balance_loss_mlp": 1.90030205, "diversity_loss_mlp": 0.0, "epoch": 0.07060407849172759, "flos": 1414392938496.0, "grad_norm": 0.0781885975604906, "language_loss": 0.79194862, "learning_rate": 0.0009956850741016502, "loss": 0.81207317, "num_input_tokens_seen": 30230256, "router_z_loss_mlp": 1.125, "routerloss_mlp": 0.0, "step": 367, "time_per_iteration": 4.857182502746582 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01238272, "balance_loss_mlp": 1.20377314, "diversity_loss_mlp": 0.0, "epoch": 0.07079646017699115, "flos": 512909853696.0, "grad_norm": 0.10459556468103207, "language_loss": 0.9040041, "learning_rate": 0.0009956441370400167, "loss": 0.91638684, "num_input_tokens_seen": 30301200, "router_z_loss_mlp": 0.34472656, "routerloss_mlp": 0.0, "step": 368, "time_per_iteration": 2.6384623050689697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01212552, "balance_loss_mlp": 1.17986465, "diversity_loss_mlp": 0.0, "epoch": 0.07098884186225471, "flos": 540501548544.0, "grad_norm": 0.11871319311308551, "language_loss": 0.96155751, "learning_rate": 0.0009956030075522636, "loss": 0.973683, "num_input_tokens_seen": 30377024, "router_z_loss_mlp": 0.3269043, "routerloss_mlp": 0.0, "step": 369, "time_per_iteration": 2.7690951824188232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098327, "balance_loss_mlp": 1.85686088, "diversity_loss_mlp": 0.26596725, "epoch": 0.07118122354751828, "flos": 548682439680.0, "grad_norm": 0.0445321938876095, "language_loss": 0.99161661, "learning_rate": 0.0009955616856543587, "loss": 1.00259984, "num_input_tokens_seen": 30448896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.03691306, "step": 370, "time_per_iteration": 2.6551451683044434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136923, "balance_loss_mlp": 1.10690594, "diversity_loss_mlp": 0.0, "epoch": 0.07137360523278184, "flos": 620904517632.0, "grad_norm": 0.06345816714032589, "language_loss": 0.89315635, "learning_rate": 0.0009955201713623448, "loss": 0.90452558, "num_input_tokens_seen": 30523584, "router_z_loss_mlp": 0.29980469, "routerloss_mlp": 0.0, "step": 371, "time_per_iteration": 2.7738049030303955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01981215, "balance_loss_mlp": 1.93124223, "diversity_loss_mlp": 0.0, "epoch": 0.0715659869180454, "flos": 1502672477184.0, "grad_norm": 0.16358882606758401, "language_loss": 0.76672721, "learning_rate": 0.000995478464692339, "loss": 0.78653932, "num_input_tokens_seen": 30757920, "router_z_loss_mlp": 0.5, "routerloss_mlp": 0.0, "step": 372, "time_per_iteration": 4.94252347946167 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117773, "balance_loss_mlp": 1.08999681, "diversity_loss_mlp": 0.0, "epoch": 0.07175836860330896, "flos": 495493065216.0, "grad_norm": 0.14652608757044766, "language_loss": 1.03006279, "learning_rate": 0.0009954365656605333, "loss": 1.04124057, "num_input_tokens_seen": 30824960, "router_z_loss_mlp": 0.27783203, "routerloss_mlp": 0.0, "step": 373, "time_per_iteration": 2.551156759262085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138367, "balance_loss_mlp": 1.10901785, "diversity_loss_mlp": 0.0, "epoch": 0.07195075028857253, "flos": 785725650432.0, "grad_norm": 0.09116429227244367, "language_loss": 0.95790577, "learning_rate": 0.0009953944742831947, "loss": 0.96928942, "num_input_tokens_seen": 30902224, "router_z_loss_mlp": 0.29296875, "routerloss_mlp": 0.0, "step": 374, "time_per_iteration": 2.995286226272583 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159694, "balance_loss_mlp": 1.13084567, "diversity_loss_mlp": 0.0, "epoch": 0.0721431319738361, "flos": 593107619328.0, "grad_norm": 0.10582188185488459, "language_loss": 0.99257255, "learning_rate": 0.0009953521905766642, "loss": 1.00416946, "num_input_tokens_seen": 30984784, "router_z_loss_mlp": 0.28808594, "routerloss_mlp": 0.0, "step": 375, "time_per_iteration": 2.946237325668335 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186879, "balance_loss_mlp": 1.15664721, "diversity_loss_mlp": 0.0, "epoch": 0.07233551365909965, "flos": 548250011136.0, "grad_norm": 0.09648654328935216, "language_loss": 0.97696835, "learning_rate": 0.0009953097145573577, "loss": 0.98883718, "num_input_tokens_seen": 31055376, "router_z_loss_mlp": 0.30200195, "routerloss_mlp": 0.0, "step": 376, "time_per_iteration": 2.64080548286438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0119333, "balance_loss_mlp": 1.16164398, "diversity_loss_mlp": 0.0, "epoch": 0.07252789534436321, "flos": 957568780800.0, "grad_norm": 0.11805021949506506, "language_loss": 0.95023847, "learning_rate": 0.000995267046241766, "loss": 0.96217185, "num_input_tokens_seen": 31144944, "router_z_loss_mlp": 0.31689453, "routerloss_mlp": 0.0, "step": 377, "time_per_iteration": 3.2120020389556885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188603, "balance_loss_mlp": 1.15617776, "diversity_loss_mlp": 0.0, "epoch": 0.07272027702962677, "flos": 507649503744.0, "grad_norm": 0.10215127385841216, "language_loss": 0.94931126, "learning_rate": 0.0009952241856464547, "loss": 0.96119732, "num_input_tokens_seen": 31213392, "router_z_loss_mlp": 0.32421875, "routerloss_mlp": 0.0, "step": 378, "time_per_iteration": 2.595047950744629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183617, "balance_loss_mlp": 1.14971423, "diversity_loss_mlp": 0.0, "epoch": 0.07291265871489035, "flos": 612412337664.0, "grad_norm": 0.08294465031859817, "language_loss": 1.01604176, "learning_rate": 0.0009951811327880632, "loss": 1.02787805, "num_input_tokens_seen": 31289840, "router_z_loss_mlp": 0.33911133, "routerloss_mlp": 0.0, "step": 379, "time_per_iteration": 2.7318813800811768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173744, "balance_loss_mlp": 1.13891101, "diversity_loss_mlp": 0.0, "epoch": 0.0731050404001539, "flos": 495750025728.0, "grad_norm": 0.06744176383892367, "language_loss": 0.94898254, "learning_rate": 0.0009951378876833063, "loss": 0.96071994, "num_input_tokens_seen": 31357600, "router_z_loss_mlp": 0.34838867, "routerloss_mlp": 0.0, "step": 380, "time_per_iteration": 2.565268039703369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01198329, "balance_loss_mlp": 1.16392517, "diversity_loss_mlp": 0.0, "epoch": 0.07329742208541747, "flos": 639966956544.0, "grad_norm": 0.08808941505023588, "language_loss": 1.01867247, "learning_rate": 0.0009950944503489736, "loss": 1.03065586, "num_input_tokens_seen": 31428896, "router_z_loss_mlp": 0.34399414, "routerloss_mlp": 0.0, "step": 381, "time_per_iteration": 2.7605583667755127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01220014, "balance_loss_mlp": 1.18479919, "diversity_loss_mlp": 0.0, "epoch": 0.07348980377068103, "flos": 816346543104.0, "grad_norm": 0.09503573620830386, "language_loss": 0.95487726, "learning_rate": 0.0009950508208019285, "loss": 0.96707737, "num_input_tokens_seen": 31507424, "router_z_loss_mlp": 0.35253906, "routerloss_mlp": 0.0, "step": 382, "time_per_iteration": 3.023996591567993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01224507, "balance_loss_mlp": 1.19086623, "diversity_loss_mlp": 0.0, "epoch": 0.0736821854559446, "flos": 508640269824.0, "grad_norm": 0.09021711867793632, "language_loss": 1.0023253, "learning_rate": 0.0009950069990591096, "loss": 1.01457047, "num_input_tokens_seen": 31576768, "router_z_loss_mlp": 0.33666992, "routerloss_mlp": 0.0, "step": 383, "time_per_iteration": 2.62634015083313 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02435347, "balance_loss_mlp": 2.36668229, "diversity_loss_mlp": 0.0, "epoch": 0.07387456714120816, "flos": 1554648629760.0, "grad_norm": 0.252441104666548, "language_loss": 0.76401371, "learning_rate": 0.0009949629851375302, "loss": 0.78836709, "num_input_tokens_seen": 31797312, "router_z_loss_mlp": 0.6875, "routerloss_mlp": 0.0, "step": 384, "time_per_iteration": 4.887000322341919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01205074, "balance_loss_mlp": 1.17217231, "diversity_loss_mlp": 0.0, "epoch": 0.07406694882647172, "flos": 525503490048.0, "grad_norm": 0.13776686153508858, "language_loss": 0.92669415, "learning_rate": 0.0009949187790542777, "loss": 0.93874478, "num_input_tokens_seen": 31869568, "router_z_loss_mlp": 0.32910156, "routerloss_mlp": 0.0, "step": 385, "time_per_iteration": 2.7325563430786133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158547, "balance_loss_mlp": 1.12683773, "diversity_loss_mlp": 0.0, "epoch": 0.07425933051173528, "flos": 497738898432.0, "grad_norm": 0.09404920935129117, "language_loss": 0.89306223, "learning_rate": 0.0009948743808265148, "loss": 0.90464771, "num_input_tokens_seen": 31941712, "router_z_loss_mlp": 0.31689453, "routerloss_mlp": 0.0, "step": 386, "time_per_iteration": 2.723581314086914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152345, "balance_loss_mlp": 1.12321043, "diversity_loss_mlp": 0.0, "epoch": 0.07445171219699885, "flos": 505003175424.0, "grad_norm": 0.11553674714385681, "language_loss": 0.98625511, "learning_rate": 0.0009948297904714782, "loss": 0.99777853, "num_input_tokens_seen": 32015232, "router_z_loss_mlp": 0.29125977, "routerloss_mlp": 0.0, "step": 387, "time_per_iteration": 2.6925902366638184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152671, "balance_loss_mlp": 1.12460923, "diversity_loss_mlp": 0.0, "epoch": 0.07464409388226241, "flos": 553977294336.0, "grad_norm": 0.10281917509950625, "language_loss": 0.91430104, "learning_rate": 0.0009947850080064796, "loss": 0.92582774, "num_input_tokens_seen": 32094640, "router_z_loss_mlp": 0.28076172, "routerloss_mlp": 0.0, "step": 388, "time_per_iteration": 2.7813222408294678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051708, "balance_loss_mlp": 1.80238378, "diversity_loss_mlp": 0.24433145, "epoch": 0.07483647556752597, "flos": 776862710784.0, "grad_norm": 0.03140321958098528, "language_loss": 0.96549261, "learning_rate": 0.0009947400334489047, "loss": 0.97600979, "num_input_tokens_seen": 32176640, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0283502, "step": 389, "time_per_iteration": 3.055640459060669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114605, "balance_loss_mlp": 1.11867988, "diversity_loss_mlp": 0.0, "epoch": 0.07502885725278953, "flos": 612540817920.0, "grad_norm": 0.10120121915973303, "language_loss": 0.87344396, "learning_rate": 0.0009946948668162145, "loss": 0.88490444, "num_input_tokens_seen": 32246704, "router_z_loss_mlp": 0.27392578, "routerloss_mlp": 0.0, "step": 390, "time_per_iteration": 2.7240688800811768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159261, "balance_loss_mlp": 1.13079381, "diversity_loss_mlp": 0.0, "epoch": 0.0752212389380531, "flos": 688629786624.0, "grad_norm": 0.0733706931740777, "language_loss": 0.92598295, "learning_rate": 0.0009946495081259441, "loss": 0.93757558, "num_input_tokens_seen": 32320032, "router_z_loss_mlp": 0.28466797, "routerloss_mlp": 0.0, "step": 391, "time_per_iteration": 2.8451168537139893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145886, "balance_loss_mlp": 1.11753774, "diversity_loss_mlp": 0.0, "epoch": 0.07541362062331666, "flos": 765699609600.0, "grad_norm": 0.0986246500370879, "language_loss": 0.95604634, "learning_rate": 0.0009946039573957035, "loss": 0.96750522, "num_input_tokens_seen": 32398144, "router_z_loss_mlp": 0.28344727, "routerloss_mlp": 0.0, "step": 392, "time_per_iteration": 2.943962574005127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142945, "balance_loss_mlp": 1.11550307, "diversity_loss_mlp": 0.0, "epoch": 0.07560600230858022, "flos": 588749202432.0, "grad_norm": 0.0698233472363084, "language_loss": 0.92221498, "learning_rate": 0.000994558214643177, "loss": 0.93364441, "num_input_tokens_seen": 32471984, "router_z_loss_mlp": 0.27441406, "routerloss_mlp": 0.0, "step": 393, "time_per_iteration": 2.7336390018463135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137637, "balance_loss_mlp": 1.10933709, "diversity_loss_mlp": 0.0, "epoch": 0.07579838399384378, "flos": 749834496000.0, "grad_norm": 0.0667709001177297, "language_loss": 0.93581867, "learning_rate": 0.000994512279886123, "loss": 0.94719505, "num_input_tokens_seen": 32550176, "router_z_loss_mlp": 0.28295898, "routerloss_mlp": 0.0, "step": 394, "time_per_iteration": 3.0792524814605713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148773, "balance_loss_mlp": 1.12104487, "diversity_loss_mlp": 0.0, "epoch": 0.07599076567910736, "flos": 523457717760.0, "grad_norm": 0.057306164352953166, "language_loss": 0.94243777, "learning_rate": 0.0009944661531423758, "loss": 0.95392549, "num_input_tokens_seen": 32620768, "router_z_loss_mlp": 0.27758789, "routerloss_mlp": 0.0, "step": 395, "time_per_iteration": 2.7003707885742188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169709, "balance_loss_mlp": 1.14162326, "diversity_loss_mlp": 0.0, "epoch": 0.07618314736437092, "flos": 551086488576.0, "grad_norm": 0.09187664036534561, "language_loss": 0.92709243, "learning_rate": 0.000994419834429843, "loss": 0.93878949, "num_input_tokens_seen": 32693472, "router_z_loss_mlp": 0.28125, "routerloss_mlp": 0.0, "step": 396, "time_per_iteration": 2.654961109161377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01184579, "balance_loss_mlp": 1.15613592, "diversity_loss_mlp": 0.0, "epoch": 0.07637552904963447, "flos": 698206708224.0, "grad_norm": 0.10401840603132484, "language_loss": 0.96742636, "learning_rate": 0.0009943733237665069, "loss": 0.97927213, "num_input_tokens_seen": 32764976, "router_z_loss_mlp": 0.28466797, "routerloss_mlp": 0.0, "step": 397, "time_per_iteration": 2.8282015323638916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204203, "balance_loss_mlp": 1.17542565, "diversity_loss_mlp": 0.0, "epoch": 0.07656791073489803, "flos": 579379682304.0, "grad_norm": 0.06433229599495933, "language_loss": 0.96130294, "learning_rate": 0.0009943266211704248, "loss": 0.97334492, "num_input_tokens_seen": 32853104, "router_z_loss_mlp": 0.28759766, "routerloss_mlp": 0.0, "step": 398, "time_per_iteration": 2.970426321029663 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183998, "balance_loss_mlp": 1.15534043, "diversity_loss_mlp": 0.0, "epoch": 0.0767602924201616, "flos": 417145780224.0, "grad_norm": 0.08157022591406732, "language_loss": 0.98195136, "learning_rate": 0.000994279726659728, "loss": 0.99379134, "num_input_tokens_seen": 32919376, "router_z_loss_mlp": 0.28662109, "routerloss_mlp": 0.0, "step": 399, "time_per_iteration": 2.5123794078826904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177562, "balance_loss_mlp": 1.14926195, "diversity_loss_mlp": 0.0, "epoch": 0.07695267410542517, "flos": 482914109952.0, "grad_norm": 0.07895179134063258, "language_loss": 0.95376462, "learning_rate": 0.0009942326402526231, "loss": 0.96554029, "num_input_tokens_seen": 32988064, "router_z_loss_mlp": 0.28320312, "routerloss_mlp": 0.0, "step": 400, "time_per_iteration": 2.52349591255188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146856, "balance_loss_mlp": 1.11905658, "diversity_loss_mlp": 0.0, "epoch": 0.07714505579068873, "flos": 530998778880.0, "grad_norm": 0.0705701607591385, "language_loss": 0.94442534, "learning_rate": 0.0009941853619673902, "loss": 0.95589387, "num_input_tokens_seen": 33059024, "router_z_loss_mlp": 0.27807617, "routerloss_mlp": 0.0, "step": 401, "time_per_iteration": 2.643442153930664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134479, "balance_loss_mlp": 1.10811007, "diversity_loss_mlp": 0.0, "epoch": 0.07733743747595229, "flos": 804995490816.0, "grad_norm": 0.11619926948996102, "language_loss": 0.97199881, "learning_rate": 0.0009941378918223844, "loss": 0.9833436, "num_input_tokens_seen": 33137712, "router_z_loss_mlp": 0.26416016, "routerloss_mlp": 0.0, "step": 402, "time_per_iteration": 3.05241322517395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124539, "balance_loss_mlp": 1.09765708, "diversity_loss_mlp": 0.0, "epoch": 0.07752981916121585, "flos": 622476016128.0, "grad_norm": 0.0628584922031364, "language_loss": 0.90586787, "learning_rate": 0.0009940902298360354, "loss": 0.91711324, "num_input_tokens_seen": 33211296, "router_z_loss_mlp": 0.26916504, "routerloss_mlp": 0.0, "step": 403, "time_per_iteration": 2.739593744277954 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123125, "balance_loss_mlp": 1.09564674, "diversity_loss_mlp": 0.0, "epoch": 0.07772220084647942, "flos": 728276603904.0, "grad_norm": 0.07463467829204698, "language_loss": 0.99357891, "learning_rate": 0.0009940423760268473, "loss": 1.00481009, "num_input_tokens_seen": 33283632, "router_z_loss_mlp": 0.27478027, "routerloss_mlp": 0.0, "step": 404, "time_per_iteration": 2.863248825073242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123907, "balance_loss_mlp": 1.09644127, "diversity_loss_mlp": 0.0, "epoch": 0.07791458253174298, "flos": 555412972032.0, "grad_norm": 0.08544352707712408, "language_loss": 0.93046296, "learning_rate": 0.0009939943304133982, "loss": 0.94170201, "num_input_tokens_seen": 33350704, "router_z_loss_mlp": 0.27514648, "routerloss_mlp": 0.0, "step": 405, "time_per_iteration": 2.631242275238037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00929276, "balance_loss_mlp": 1.55583501, "diversity_loss_mlp": 0.25816602, "epoch": 0.07810696421700654, "flos": 553181819904.0, "grad_norm": 0.039808149400508724, "language_loss": 1.0085814, "learning_rate": 0.0009939460930143416, "loss": 1.017874, "num_input_tokens_seen": 33416272, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02227605, "step": 406, "time_per_iteration": 2.655000925064087 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00908113, "balance_loss_mlp": 1.5136435, "diversity_loss_mlp": 0.25845903, "epoch": 0.0782993459022701, "flos": 650633389056.0, "grad_norm": 0.031543409668047605, "language_loss": 0.94866949, "learning_rate": 0.0009938976638484043, "loss": 0.95775062, "num_input_tokens_seen": 33501824, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02206134, "step": 407, "time_per_iteration": 2.932522773742676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125815, "balance_loss_mlp": 1.09954083, "diversity_loss_mlp": 0.0, "epoch": 0.07849172758753367, "flos": 496172542464.0, "grad_norm": 0.0874520562524596, "language_loss": 0.93291676, "learning_rate": 0.0009938490429343887, "loss": 0.94417489, "num_input_tokens_seen": 33571456, "router_z_loss_mlp": 0.26306152, "routerloss_mlp": 0.0, "step": 408, "time_per_iteration": 2.5488343238830566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128514, "balance_loss_mlp": 1.10140562, "diversity_loss_mlp": 0.0, "epoch": 0.07868410927279723, "flos": 577971542016.0, "grad_norm": 0.1051667442879041, "language_loss": 0.94155729, "learning_rate": 0.0009938002302911709, "loss": 0.95284247, "num_input_tokens_seen": 33646320, "router_z_loss_mlp": 0.27148438, "routerloss_mlp": 0.0, "step": 409, "time_per_iteration": 2.7672979831695557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136116, "balance_loss_mlp": 1.10946035, "diversity_loss_mlp": 0.0, "epoch": 0.07887649095806079, "flos": 522970960896.0, "grad_norm": 0.09613329153911296, "language_loss": 0.9601537, "learning_rate": 0.0009937512259377015, "loss": 0.97151482, "num_input_tokens_seen": 33717664, "router_z_loss_mlp": 0.26660156, "routerloss_mlp": 0.0, "step": 410, "time_per_iteration": 2.674072504043579 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159019, "balance_loss_mlp": 1.13217306, "diversity_loss_mlp": 0.0, "epoch": 0.07906887264332435, "flos": 557253540864.0, "grad_norm": 0.05951235305386178, "language_loss": 0.95475662, "learning_rate": 0.000993702029893006, "loss": 0.96634674, "num_input_tokens_seen": 33794720, "router_z_loss_mlp": 0.26879883, "routerloss_mlp": 0.0, "step": 411, "time_per_iteration": 2.7913753986358643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185856, "balance_loss_mlp": 1.15731764, "diversity_loss_mlp": 0.0, "epoch": 0.07926125432858792, "flos": 821984993280.0, "grad_norm": 0.10961223184545879, "language_loss": 0.95336723, "learning_rate": 0.0009936526421761838, "loss": 0.96522582, "num_input_tokens_seen": 33868304, "router_z_loss_mlp": 0.28540039, "routerloss_mlp": 0.0, "step": 412, "time_per_iteration": 3.036557197570801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181446, "balance_loss_mlp": 1.15414703, "diversity_loss_mlp": 0.0, "epoch": 0.07945363601385148, "flos": 562336224768.0, "grad_norm": 0.09075853005030154, "language_loss": 0.97731507, "learning_rate": 0.000993603062806409, "loss": 0.98912954, "num_input_tokens_seen": 33937424, "router_z_loss_mlp": 0.27319336, "routerloss_mlp": 0.0, "step": 413, "time_per_iteration": 2.690500259399414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166438, "balance_loss_mlp": 1.1394248, "diversity_loss_mlp": 0.0, "epoch": 0.07964601769911504, "flos": 517868826624.0, "grad_norm": 0.0841151797190701, "language_loss": 1.00301099, "learning_rate": 0.0009935532918029298, "loss": 1.01467538, "num_input_tokens_seen": 34003984, "router_z_loss_mlp": 0.27050781, "routerloss_mlp": 0.0, "step": 414, "time_per_iteration": 2.6386477947235107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171646, "balance_loss_mlp": 1.14432323, "diversity_loss_mlp": 0.0, "epoch": 0.0798383993843786, "flos": 539224086528.0, "grad_norm": 0.07267589634089947, "language_loss": 0.94145483, "learning_rate": 0.0009935033291850694, "loss": 0.95317131, "num_input_tokens_seen": 34072400, "router_z_loss_mlp": 0.27307129, "routerloss_mlp": 0.0, "step": 415, "time_per_iteration": 2.6771326065063477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138565, "balance_loss_mlp": 1.11312544, "diversity_loss_mlp": 0.0, "epoch": 0.08003078106964218, "flos": 485145262080.0, "grad_norm": 0.09244391725109519, "language_loss": 0.96404541, "learning_rate": 0.0009934531749722247, "loss": 0.97543103, "num_input_tokens_seen": 34142448, "router_z_loss_mlp": 0.25463867, "routerloss_mlp": 0.0, "step": 416, "time_per_iteration": 2.586975574493408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132517, "balance_loss_mlp": 1.10733998, "diversity_loss_mlp": 0.0, "epoch": 0.08022316275490574, "flos": 518254267392.0, "grad_norm": 0.0915153559751851, "language_loss": 0.94398224, "learning_rate": 0.0009934028291838672, "loss": 0.95530736, "num_input_tokens_seen": 34214080, "router_z_loss_mlp": 0.25183105, "routerloss_mlp": 0.0, "step": 417, "time_per_iteration": 2.7062928676605225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150706, "balance_loss_mlp": 1.1251713, "diversity_loss_mlp": 0.0, "epoch": 0.0804155444401693, "flos": 494012971008.0, "grad_norm": 0.10053131301435142, "language_loss": 0.89968443, "learning_rate": 0.0009933522918395433, "loss": 0.91119152, "num_input_tokens_seen": 34288448, "router_z_loss_mlp": 0.25549316, "routerloss_mlp": 0.0, "step": 418, "time_per_iteration": 2.65326189994812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00760745, "balance_loss_mlp": 1.16580379, "diversity_loss_mlp": 0.256477, "epoch": 0.08060792612543285, "flos": 1581422455296.0, "grad_norm": 0.006992447528439397, "language_loss": 0.782511, "learning_rate": 0.0009933015629588731, "loss": 0.79011846, "num_input_tokens_seen": 34521632, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.049605, "step": 419, "time_per_iteration": 4.8772523403167725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176473, "balance_loss_mlp": 1.15143883, "diversity_loss_mlp": 0.0, "epoch": 0.08080030781069643, "flos": 525343076352.0, "grad_norm": 0.08608768077535772, "language_loss": 1.07860529, "learning_rate": 0.000993250642561551, "loss": 1.09036994, "num_input_tokens_seen": 34590080, "router_z_loss_mlp": 0.25061035, "routerloss_mlp": 0.0, "step": 420, "time_per_iteration": 2.588672399520874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176613, "balance_loss_mlp": 1.15165043, "diversity_loss_mlp": 0.0, "epoch": 0.08099268949595999, "flos": 546718159872.0, "grad_norm": 0.09804047271530963, "language_loss": 0.93524832, "learning_rate": 0.0009931995306673466, "loss": 0.94701445, "num_input_tokens_seen": 34660512, "router_z_loss_mlp": 0.24951172, "routerloss_mlp": 0.0, "step": 421, "time_per_iteration": 2.734513521194458 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01200943, "balance_loss_mlp": 1.17474103, "diversity_loss_mlp": 0.0, "epoch": 0.08118507118122355, "flos": 510367412736.0, "grad_norm": 0.0768650968130289, "language_loss": 0.98959565, "learning_rate": 0.000993148227296103, "loss": 1.00160503, "num_input_tokens_seen": 34732016, "router_z_loss_mlp": 0.26245117, "routerloss_mlp": 0.0, "step": 422, "time_per_iteration": 2.6389012336730957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185361, "balance_loss_mlp": 1.1604228, "diversity_loss_mlp": 0.0, "epoch": 0.08137745286648711, "flos": 720671302656.0, "grad_norm": 0.08220754838372611, "language_loss": 0.87845761, "learning_rate": 0.000993096732467738, "loss": 0.89031118, "num_input_tokens_seen": 34810416, "router_z_loss_mlp": 0.24938965, "routerloss_mlp": 0.0, "step": 423, "time_per_iteration": 2.976412057876587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00884908, "balance_loss_mlp": 1.45653749, "diversity_loss_mlp": 0.26738948, "epoch": 0.08156983455175067, "flos": 679613773824.0, "grad_norm": 0.04326164577840749, "language_loss": 0.94753903, "learning_rate": 0.0009930450462022435, "loss": 0.95638812, "num_input_tokens_seen": 34879504, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02294483, "step": 424, "time_per_iteration": 2.9038002490997314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02462639, "balance_loss_mlp": 2.35582733, "diversity_loss_mlp": 0.0, "epoch": 0.08176221623701424, "flos": 1453377157632.0, "grad_norm": 0.15208391867633483, "language_loss": 0.79189807, "learning_rate": 0.0009929931685196862, "loss": 0.81652445, "num_input_tokens_seen": 35111584, "router_z_loss_mlp": 1.0703125, "routerloss_mlp": 0.0, "step": 425, "time_per_iteration": 4.893689155578613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01182525, "balance_loss_mlp": 1.15690684, "diversity_loss_mlp": 0.0, "epoch": 0.0819545979222778, "flos": 1556602292736.0, "grad_norm": 0.10181541083425144, "language_loss": 0.92197704, "learning_rate": 0.0009929410994402065, "loss": 0.93380231, "num_input_tokens_seen": 35205664, "router_z_loss_mlp": 0.25646973, "routerloss_mlp": 0.0, "step": 426, "time_per_iteration": 3.793488025665283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00863772, "balance_loss_mlp": 1.42266524, "diversity_loss_mlp": 0.26325443, "epoch": 0.08214697960754136, "flos": 512724473856.0, "grad_norm": 0.038163151149059646, "language_loss": 0.97185421, "learning_rate": 0.0009928888389840196, "loss": 0.98049194, "num_input_tokens_seen": 35280144, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02081174, "step": 427, "time_per_iteration": 2.7310097217559814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196199, "balance_loss_mlp": 1.1708436, "diversity_loss_mlp": 0.0, "epoch": 0.08233936129280492, "flos": 595124029440.0, "grad_norm": 0.1014811860289813, "language_loss": 0.98936689, "learning_rate": 0.0009928363871714147, "loss": 1.00132895, "num_input_tokens_seen": 35344768, "router_z_loss_mlp": 0.25378418, "routerloss_mlp": 0.0, "step": 428, "time_per_iteration": 2.650698184967041 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01198239, "balance_loss_mlp": 1.17194164, "diversity_loss_mlp": 0.0, "epoch": 0.08253174297806849, "flos": 572039055360.0, "grad_norm": 0.0884548399202502, "language_loss": 0.93840969, "learning_rate": 0.0009927837440227556, "loss": 0.95039201, "num_input_tokens_seen": 35425536, "router_z_loss_mlp": 0.26306152, "routerloss_mlp": 0.0, "step": 429, "time_per_iteration": 2.8162689208984375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199498, "balance_loss_mlp": 1.17399931, "diversity_loss_mlp": 0.0, "epoch": 0.08272412466333205, "flos": 623380147200.0, "grad_norm": 0.0660726649824177, "language_loss": 0.88846099, "learning_rate": 0.0009927309095584798, "loss": 0.90045595, "num_input_tokens_seen": 35515440, "router_z_loss_mlp": 0.25524902, "routerloss_mlp": 0.0, "step": 430, "time_per_iteration": 2.975594997406006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01190829, "balance_loss_mlp": 1.1661284, "diversity_loss_mlp": 0.0, "epoch": 0.08291650634859561, "flos": 513994595328.0, "grad_norm": 0.08430379744466543, "language_loss": 0.98639262, "learning_rate": 0.0009926778837991, "loss": 0.99830091, "num_input_tokens_seen": 35580192, "router_z_loss_mlp": 0.24682617, "routerloss_mlp": 0.0, "step": 431, "time_per_iteration": 2.595855236053467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187757, "balance_loss_mlp": 1.16231799, "diversity_loss_mlp": 0.0, "epoch": 0.08310888803385917, "flos": 667365931008.0, "grad_norm": 0.08045199303169787, "language_loss": 0.97297168, "learning_rate": 0.000992624666765202, "loss": 0.98484921, "num_input_tokens_seen": 35649472, "router_z_loss_mlp": 0.2545166, "routerloss_mlp": 0.0, "step": 432, "time_per_iteration": 2.828488826751709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195331, "balance_loss_mlp": 1.17080951, "diversity_loss_mlp": 0.0, "epoch": 0.08330126971912274, "flos": 583293560832.0, "grad_norm": 0.08518069864439091, "language_loss": 0.9513936, "learning_rate": 0.000992571258477447, "loss": 0.96334684, "num_input_tokens_seen": 35722848, "router_z_loss_mlp": 0.24536133, "routerloss_mlp": 0.0, "step": 433, "time_per_iteration": 2.7914628982543945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181479, "balance_loss_mlp": 1.15727913, "diversity_loss_mlp": 0.0, "epoch": 0.0834936514043863, "flos": 561350227968.0, "grad_norm": 0.08514456826718247, "language_loss": 0.89393032, "learning_rate": 0.0009925176589565695, "loss": 0.90574509, "num_input_tokens_seen": 35800944, "router_z_loss_mlp": 0.24182129, "routerloss_mlp": 0.0, "step": 434, "time_per_iteration": 2.847381830215454 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154073, "balance_loss_mlp": 1.13002813, "diversity_loss_mlp": 0.0, "epoch": 0.08368603308964986, "flos": 494519551488.0, "grad_norm": 0.09497783603336436, "language_loss": 0.99263078, "learning_rate": 0.0009924638682233791, "loss": 1.00417161, "num_input_tokens_seen": 35866288, "router_z_loss_mlp": 0.24047852, "routerloss_mlp": 0.0, "step": 435, "time_per_iteration": 2.5871427059173584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02505725, "balance_loss_mlp": 2.43934894, "diversity_loss_mlp": 0.0, "epoch": 0.08387841477491342, "flos": 1389017714688.0, "grad_norm": 0.06827578128022488, "language_loss": 0.79564589, "learning_rate": 0.0009924098862987589, "loss": 0.82070321, "num_input_tokens_seen": 36083040, "router_z_loss_mlp": 0.6640625, "routerloss_mlp": 0.0, "step": 436, "time_per_iteration": 4.539026737213135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138708, "balance_loss_mlp": 1.11440182, "diversity_loss_mlp": 0.0, "epoch": 0.084070796460177, "flos": 798984082944.0, "grad_norm": 0.10357837156718612, "language_loss": 0.8856501, "learning_rate": 0.0009923557132036668, "loss": 0.89703721, "num_input_tokens_seen": 36158816, "router_z_loss_mlp": 0.24304199, "routerloss_mlp": 0.0, "step": 437, "time_per_iteration": 3.0414698123931885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124752, "balance_loss_mlp": 1.09998, "diversity_loss_mlp": 0.0, "epoch": 0.08426317814544056, "flos": 558963431424.0, "grad_norm": 0.06660243724344939, "language_loss": 0.94103611, "learning_rate": 0.0009923013489591345, "loss": 0.95228368, "num_input_tokens_seen": 36236432, "router_z_loss_mlp": 0.24768066, "routerloss_mlp": 0.0, "step": 438, "time_per_iteration": 2.7426626682281494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00857144, "balance_loss_mlp": 1.4199276, "diversity_loss_mlp": 0.26049304, "epoch": 0.08445555983070412, "flos": 810421396992.0, "grad_norm": 0.04620678173721227, "language_loss": 0.92873847, "learning_rate": 0.0009922467935862681, "loss": 0.93730992, "num_input_tokens_seen": 36327952, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01693399, "step": 439, "time_per_iteration": 3.107149124145508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113851, "balance_loss_mlp": 1.11386943, "diversity_loss_mlp": 0.0, "epoch": 0.08464794151596768, "flos": 510184604160.0, "grad_norm": 0.07763968648184205, "language_loss": 0.95120305, "learning_rate": 0.0009921920471062478, "loss": 0.96258819, "num_input_tokens_seen": 36394896, "router_z_loss_mlp": 0.24633789, "routerloss_mlp": 0.0, "step": 440, "time_per_iteration": 2.572195529937744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139984, "balance_loss_mlp": 1.11489022, "diversity_loss_mlp": 0.0, "epoch": 0.08484032320123125, "flos": 556413649920.0, "grad_norm": 0.0880262953369173, "language_loss": 0.92829931, "learning_rate": 0.0009921371095403281, "loss": 0.93969917, "num_input_tokens_seen": 36464656, "router_z_loss_mlp": 0.25109863, "routerloss_mlp": 0.0, "step": 441, "time_per_iteration": 2.6386919021606445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156684, "balance_loss_mlp": 1.13206697, "diversity_loss_mlp": 0.0, "epoch": 0.08503270488649481, "flos": 527354343936.0, "grad_norm": 0.09427081021892933, "language_loss": 0.95792937, "learning_rate": 0.0009920819809098379, "loss": 0.96949625, "num_input_tokens_seen": 36532208, "router_z_loss_mlp": 0.24633789, "routerloss_mlp": 0.0, "step": 442, "time_per_iteration": 2.588674783706665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169571, "balance_loss_mlp": 1.1441319, "diversity_loss_mlp": 0.0, "epoch": 0.08522508657175837, "flos": 614267960832.0, "grad_norm": 0.0873536117240321, "language_loss": 0.91373646, "learning_rate": 0.0009920266612361798, "loss": 0.92543221, "num_input_tokens_seen": 36607360, "router_z_loss_mlp": 0.25463867, "routerloss_mlp": 0.0, "step": 443, "time_per_iteration": 2.755526065826416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167225, "balance_loss_mlp": 1.14349055, "diversity_loss_mlp": 0.0, "epoch": 0.08541746825702193, "flos": 619792611840.0, "grad_norm": 0.07116177044877865, "language_loss": 0.90907955, "learning_rate": 0.0009919711505408308, "loss": 0.92075175, "num_input_tokens_seen": 36680688, "router_z_loss_mlp": 0.23718262, "routerloss_mlp": 0.0, "step": 444, "time_per_iteration": 2.7939865589141846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116645, "balance_loss_mlp": 1.14170241, "diversity_loss_mlp": 0.0, "epoch": 0.08560984994228549, "flos": 482914109952.0, "grad_norm": 0.09221719775958219, "language_loss": 0.89192301, "learning_rate": 0.000991915448845342, "loss": 0.90358752, "num_input_tokens_seen": 36746288, "router_z_loss_mlp": 0.24731445, "routerloss_mlp": 0.0, "step": 445, "time_per_iteration": 2.5457842350006104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154656, "balance_loss_mlp": 1.13168466, "diversity_loss_mlp": 0.0, "epoch": 0.08580223162754906, "flos": 517152273408.0, "grad_norm": 0.08780021998431992, "language_loss": 0.98329008, "learning_rate": 0.000991859556171339, "loss": 0.99483669, "num_input_tokens_seen": 36812528, "router_z_loss_mlp": 0.22973633, "routerloss_mlp": 0.0, "step": 446, "time_per_iteration": 2.6356756687164307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0083848, "balance_loss_mlp": 1.38336182, "diversity_loss_mlp": 0.25472927, "epoch": 0.08599461331281262, "flos": 531475623936.0, "grad_norm": 0.049564893991705376, "language_loss": 1.00050902, "learning_rate": 0.000991803472540521, "loss": 1.00889397, "num_input_tokens_seen": 36879248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01943407, "step": 447, "time_per_iteration": 2.631704807281494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130193, "balance_loss_mlp": 1.1087712, "diversity_loss_mlp": 0.0, "epoch": 0.08618699499807618, "flos": 790299182592.0, "grad_norm": 0.11682082282160788, "language_loss": 0.94917679, "learning_rate": 0.0009917471979746615, "loss": 0.96047872, "num_input_tokens_seen": 36951376, "router_z_loss_mlp": 0.21435547, "routerloss_mlp": 0.0, "step": 448, "time_per_iteration": 2.9820516109466553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122722, "balance_loss_mlp": 1.10119319, "diversity_loss_mlp": 0.0, "epoch": 0.08637937668333974, "flos": 565988000256.0, "grad_norm": 0.07207820272739716, "language_loss": 0.94521272, "learning_rate": 0.0009916907324956086, "loss": 0.95643997, "num_input_tokens_seen": 37025936, "router_z_loss_mlp": 0.21533203, "routerloss_mlp": 0.0, "step": 449, "time_per_iteration": 2.701571464538574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127326, "balance_loss_mlp": 1.10453379, "diversity_loss_mlp": 0.0, "epoch": 0.08657175836860331, "flos": 445167332352.0, "grad_norm": 0.081693490118891, "language_loss": 0.90889072, "learning_rate": 0.0009916340761252837, "loss": 0.92016399, "num_input_tokens_seen": 37095872, "router_z_loss_mlp": 0.2277832, "routerloss_mlp": 0.0, "step": 450, "time_per_iteration": 2.598238945007324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124635, "balance_loss_mlp": 1.10287929, "diversity_loss_mlp": 0.0, "epoch": 0.08676414005386687, "flos": 844148210688.0, "grad_norm": 0.08322873762038852, "language_loss": 0.88526833, "learning_rate": 0.0009915772288856832, "loss": 0.89651471, "num_input_tokens_seen": 37179072, "router_z_loss_mlp": 0.21765137, "routerloss_mlp": 0.0, "step": 451, "time_per_iteration": 3.0680441856384277 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121178, "balance_loss_mlp": 1.09876692, "diversity_loss_mlp": 0.0, "epoch": 0.08695652173913043, "flos": 603292437504.0, "grad_norm": 0.07764148626601892, "language_loss": 0.8994481, "learning_rate": 0.000991520190798877, "loss": 0.91065991, "num_input_tokens_seen": 37260288, "router_z_loss_mlp": 0.22412109, "routerloss_mlp": 0.0, "step": 452, "time_per_iteration": 2.7982983589172363 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136254, "balance_loss_mlp": 1.11281788, "diversity_loss_mlp": 0.0, "epoch": 0.08714890342439399, "flos": 730737552384.0, "grad_norm": 0.11496723003988224, "language_loss": 0.98584056, "learning_rate": 0.0009914629618870089, "loss": 0.99720311, "num_input_tokens_seen": 37331136, "router_z_loss_mlp": 0.23425293, "routerloss_mlp": 0.0, "step": 453, "time_per_iteration": 2.8737423419952393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0218934, "balance_loss_mlp": 2.1624465, "diversity_loss_mlp": 0.0, "epoch": 0.08734128510965757, "flos": 1482303214080.0, "grad_norm": 0.09249743450545506, "language_loss": 0.78675872, "learning_rate": 0.0009914055421722976, "loss": 0.8086521, "num_input_tokens_seen": 37559040, "router_z_loss_mlp": 0.26953125, "routerloss_mlp": 0.0, "step": 454, "time_per_iteration": 4.756322860717773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02065274, "balance_loss_mlp": 2.03780842, "diversity_loss_mlp": 0.0, "epoch": 0.08753366679492113, "flos": 1523022289920.0, "grad_norm": 0.0744981683452351, "language_loss": 0.81427962, "learning_rate": 0.0009913479316770353, "loss": 0.83493233, "num_input_tokens_seen": 37785136, "router_z_loss_mlp": 0.27539062, "routerloss_mlp": 0.0, "step": 455, "time_per_iteration": 2.173584461212158 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00848454, "balance_loss_mlp": 1.40727437, "diversity_loss_mlp": 0.24745712, "epoch": 0.08772604848018468, "flos": 721252035072.0, "grad_norm": 0.04702924064086775, "language_loss": 0.92085564, "learning_rate": 0.0009912901304235883, "loss": 0.92934018, "num_input_tokens_seen": 37858832, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0210887, "step": 456, "time_per_iteration": 2.868276596069336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01273346, "balance_loss_mlp": 1.24886012, "diversity_loss_mlp": 0.0, "epoch": 0.08791843016544824, "flos": 708233310720.0, "grad_norm": 0.1518400720273604, "language_loss": 0.87943619, "learning_rate": 0.000991232138434397, "loss": 0.89216965, "num_input_tokens_seen": 37931856, "router_z_loss_mlp": 0.24499512, "routerloss_mlp": 0.0, "step": 457, "time_per_iteration": 2.8729381561279297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01262571, "balance_loss_mlp": 1.23763299, "diversity_loss_mlp": 0.0, "epoch": 0.08811081185071182, "flos": 473043151872.0, "grad_norm": 0.14470377187588201, "language_loss": 0.94336045, "learning_rate": 0.000991173955731976, "loss": 0.9559862, "num_input_tokens_seen": 38002432, "router_z_loss_mlp": 0.24951172, "routerloss_mlp": 0.0, "step": 458, "time_per_iteration": 2.7100729942321777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01218734, "balance_loss_mlp": 1.19520259, "diversity_loss_mlp": 0.0, "epoch": 0.08830319353597538, "flos": 684980209152.0, "grad_norm": 0.09239254139658798, "language_loss": 0.99845707, "learning_rate": 0.0009911155823389137, "loss": 1.01064444, "num_input_tokens_seen": 38081648, "router_z_loss_mlp": 0.23547363, "routerloss_mlp": 0.0, "step": 459, "time_per_iteration": 2.9462080001831055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178782, "balance_loss_mlp": 1.1555717, "diversity_loss_mlp": 0.0, "epoch": 0.08849557522123894, "flos": 573509237760.0, "grad_norm": 0.0878830171329016, "language_loss": 0.95269191, "learning_rate": 0.000991057018277873, "loss": 0.9644798, "num_input_tokens_seen": 38153424, "router_z_loss_mlp": 0.23205566, "routerloss_mlp": 0.0, "step": 460, "time_per_iteration": 2.7473583221435547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151252, "balance_loss_mlp": 1.12904322, "diversity_loss_mlp": 0.0, "epoch": 0.0886879569065025, "flos": 564567376896.0, "grad_norm": 0.1205367347306004, "language_loss": 0.9509443, "learning_rate": 0.0009909982635715898, "loss": 0.96245682, "num_input_tokens_seen": 38223008, "router_z_loss_mlp": 0.22216797, "routerloss_mlp": 0.0, "step": 461, "time_per_iteration": 2.6226725578308105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145607, "balance_loss_mlp": 1.12300491, "diversity_loss_mlp": 0.0, "epoch": 0.08888033859176607, "flos": 563877987840.0, "grad_norm": 0.0884001914091671, "language_loss": 0.94182885, "learning_rate": 0.0009909393182428751, "loss": 0.95328492, "num_input_tokens_seen": 38294592, "router_z_loss_mlp": 0.22619629, "routerloss_mlp": 0.0, "step": 462, "time_per_iteration": 2.632216453552246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157329, "balance_loss_mlp": 1.13402367, "diversity_loss_mlp": 0.0, "epoch": 0.08907272027702963, "flos": 465761622528.0, "grad_norm": 0.09814328047414513, "language_loss": 0.89072084, "learning_rate": 0.000990880182314614, "loss": 0.90229416, "num_input_tokens_seen": 38365792, "router_z_loss_mlp": 0.23291016, "routerloss_mlp": 0.0, "step": 463, "time_per_iteration": 2.6763410568237305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.008652, "balance_loss_mlp": 1.44467092, "diversity_loss_mlp": 0.24997658, "epoch": 0.08926510196229319, "flos": 681528494592.0, "grad_norm": 0.034550824680377484, "language_loss": 0.89998591, "learning_rate": 0.0009908208558097643, "loss": 0.90863788, "num_input_tokens_seen": 38447776, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01787652, "step": 464, "time_per_iteration": 2.9323060512542725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01224446, "balance_loss_mlp": 1.20036614, "diversity_loss_mlp": 0.0, "epoch": 0.08945748364755675, "flos": 596692956672.0, "grad_norm": 0.11121459240038054, "language_loss": 0.9153899, "learning_rate": 0.000990761338751359, "loss": 0.92763436, "num_input_tokens_seen": 38521632, "router_z_loss_mlp": 0.24072266, "routerloss_mlp": 0.0, "step": 465, "time_per_iteration": 2.7976956367492676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01887012, "balance_loss_mlp": 1.84867477, "diversity_loss_mlp": 0.0, "epoch": 0.08964986533282032, "flos": 1585931747328.0, "grad_norm": 0.10155840838291885, "language_loss": 0.73659623, "learning_rate": 0.0009907016311625045, "loss": 0.75546634, "num_input_tokens_seen": 38760528, "router_z_loss_mlp": 0.3828125, "routerloss_mlp": 0.0, "step": 466, "time_per_iteration": 4.965139150619507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01319273, "balance_loss_mlp": 1.29344034, "diversity_loss_mlp": 0.0, "epoch": 0.08984224701808388, "flos": 533523967488.0, "grad_norm": 0.10901527230577203, "language_loss": 0.93872285, "learning_rate": 0.0009906417330663815, "loss": 0.95191562, "num_input_tokens_seen": 38827200, "router_z_loss_mlp": 0.25866699, "routerloss_mlp": 0.0, "step": 467, "time_per_iteration": 2.628042459487915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01352641, "balance_loss_mlp": 1.3264153, "diversity_loss_mlp": 0.0, "epoch": 0.09003462870334744, "flos": 478931222016.0, "grad_norm": 0.10051526680757361, "language_loss": 0.90321958, "learning_rate": 0.0009905816444862442, "loss": 0.91674596, "num_input_tokens_seen": 38891984, "router_z_loss_mlp": 0.26245117, "routerloss_mlp": 0.0, "step": 468, "time_per_iteration": 2.613952398300171 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01396274, "balance_loss_mlp": 1.36905813, "diversity_loss_mlp": 0.0, "epoch": 0.090227010388611, "flos": 653625510912.0, "grad_norm": 0.10220310656667285, "language_loss": 0.88433367, "learning_rate": 0.0009905213654454216, "loss": 0.89829642, "num_input_tokens_seen": 38977136, "router_z_loss_mlp": 0.27209473, "routerloss_mlp": 0.0, "step": 469, "time_per_iteration": 2.897365093231201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01363851, "balance_loss_mlp": 1.3367548, "diversity_loss_mlp": 0.0, "epoch": 0.09041939207387456, "flos": 618186608640.0, "grad_norm": 0.11223211494597432, "language_loss": 0.94907629, "learning_rate": 0.0009904608959673158, "loss": 0.96271479, "num_input_tokens_seen": 39052224, "router_z_loss_mlp": 0.2713623, "routerloss_mlp": 0.0, "step": 470, "time_per_iteration": 2.7828967571258545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01328731, "balance_loss_mlp": 1.30289829, "diversity_loss_mlp": 0.0, "epoch": 0.09061177375913813, "flos": 454368724992.0, "grad_norm": 0.10534875872888719, "language_loss": 0.94143116, "learning_rate": 0.000990400236075403, "loss": 0.95471847, "num_input_tokens_seen": 39116832, "router_z_loss_mlp": 0.25866699, "routerloss_mlp": 0.0, "step": 471, "time_per_iteration": 2.5291385650634766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0126, "balance_loss_mlp": 1.23546696, "diversity_loss_mlp": 0.0, "epoch": 0.0908041554444017, "flos": 544247299584.0, "grad_norm": 0.08150240013734093, "language_loss": 0.92401147, "learning_rate": 0.0009903393857932338, "loss": 0.93661153, "num_input_tokens_seen": 39190528, "router_z_loss_mlp": 0.24536133, "routerloss_mlp": 0.0, "step": 472, "time_per_iteration": 2.6317975521087646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01234666, "balance_loss_mlp": 1.21105075, "diversity_loss_mlp": 0.0, "epoch": 0.09099653712966525, "flos": 564335009280.0, "grad_norm": 0.1079858906687858, "language_loss": 0.89742762, "learning_rate": 0.0009902783451444317, "loss": 0.90977424, "num_input_tokens_seen": 39263168, "router_z_loss_mlp": 0.23583984, "routerloss_mlp": 0.0, "step": 473, "time_per_iteration": 2.708159923553467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204783, "balance_loss_mlp": 1.18326581, "diversity_loss_mlp": 0.0, "epoch": 0.09118891881492881, "flos": 474540498432.0, "grad_norm": 0.08561107807714156, "language_loss": 0.94620812, "learning_rate": 0.0009902171141526956, "loss": 0.95825595, "num_input_tokens_seen": 39330784, "router_z_loss_mlp": 0.21533203, "routerloss_mlp": 0.0, "step": 474, "time_per_iteration": 2.5238943099975586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196875, "balance_loss_mlp": 1.17460644, "diversity_loss_mlp": 0.0, "epoch": 0.09138130050019239, "flos": 545860643328.0, "grad_norm": 0.10745755704500252, "language_loss": 0.82875264, "learning_rate": 0.000990155692841797, "loss": 0.84072143, "num_input_tokens_seen": 39417472, "router_z_loss_mlp": 0.22277832, "routerloss_mlp": 0.0, "step": 475, "time_per_iteration": 2.985820770263672 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01191147, "balance_loss_mlp": 1.16911697, "diversity_loss_mlp": 0.0, "epoch": 0.09157368218545595, "flos": 732711744000.0, "grad_norm": 0.10692573165988825, "language_loss": 0.93685389, "learning_rate": 0.0009900940812355818, "loss": 0.9487654, "num_input_tokens_seen": 39488656, "router_z_loss_mlp": 0.22033691, "routerloss_mlp": 0.0, "step": 476, "time_per_iteration": 2.882946014404297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01182015, "balance_loss_mlp": 1.15972316, "diversity_loss_mlp": 0.0, "epoch": 0.0917660638707195, "flos": 610981802496.0, "grad_norm": 0.15748592495925862, "language_loss": 0.89566875, "learning_rate": 0.00099003227935797, "loss": 0.90748894, "num_input_tokens_seen": 39558224, "router_z_loss_mlp": 0.22290039, "routerloss_mlp": 0.0, "step": 477, "time_per_iteration": 2.729729413986206 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176422, "balance_loss_mlp": 1.15324748, "diversity_loss_mlp": 0.0, "epoch": 0.09195844555598306, "flos": 655851893760.0, "grad_norm": 0.11223041806675033, "language_loss": 0.92644513, "learning_rate": 0.000989970287232955, "loss": 0.93820935, "num_input_tokens_seen": 39629856, "router_z_loss_mlp": 0.23156738, "routerloss_mlp": 0.0, "step": 478, "time_per_iteration": 2.770315647125244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168241, "balance_loss_mlp": 1.14524555, "diversity_loss_mlp": 0.0, "epoch": 0.09215082724124664, "flos": 476578930176.0, "grad_norm": 0.08330283562574453, "language_loss": 0.90444613, "learning_rate": 0.0009899081048846043, "loss": 0.91612852, "num_input_tokens_seen": 39695984, "router_z_loss_mlp": 0.2298584, "routerloss_mlp": 0.0, "step": 479, "time_per_iteration": 2.548454523086548 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01230508, "balance_loss_mlp": 1.20630884, "diversity_loss_mlp": 0.0, "epoch": 0.0923432089265102, "flos": 524305322496.0, "grad_norm": 0.17103007353978975, "language_loss": 0.94793594, "learning_rate": 0.0009898457323370593, "loss": 0.96024096, "num_input_tokens_seen": 39760256, "router_z_loss_mlp": 0.24206543, "routerloss_mlp": 0.0, "step": 480, "time_per_iteration": 2.582655668258667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01249007, "balance_loss_mlp": 1.22349596, "diversity_loss_mlp": 0.0, "epoch": 0.09253559061177376, "flos": 545569178112.0, "grad_norm": 0.11976742763400251, "language_loss": 0.9370476, "learning_rate": 0.000989783169614535, "loss": 0.94953763, "num_input_tokens_seen": 39827984, "router_z_loss_mlp": 0.25537109, "routerloss_mlp": 0.0, "step": 481, "time_per_iteration": 2.6305787563323975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01772239, "balance_loss_mlp": 1.74649, "diversity_loss_mlp": 0.0, "epoch": 0.09272797229703732, "flos": 1538042370048.0, "grad_norm": 0.0876770513617693, "language_loss": 0.78752756, "learning_rate": 0.0009897204167413206, "loss": 0.80524993, "num_input_tokens_seen": 40056688, "router_z_loss_mlp": 0.2578125, "routerloss_mlp": 0.0, "step": 482, "time_per_iteration": 4.8690409660339355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01276229, "balance_loss_mlp": 1.25084925, "diversity_loss_mlp": 0.0, "epoch": 0.09292035398230089, "flos": 689813273088.0, "grad_norm": 0.10686208189243855, "language_loss": 0.91100538, "learning_rate": 0.000989657473741779, "loss": 0.92376775, "num_input_tokens_seen": 40133120, "router_z_loss_mlp": 0.25402832, "routerloss_mlp": 0.0, "step": 483, "time_per_iteration": 2.8294553756713867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01275465, "balance_loss_mlp": 1.25022864, "diversity_loss_mlp": 0.0, "epoch": 0.09311273566756445, "flos": 509749604352.0, "grad_norm": 0.09087050091564236, "language_loss": 0.92375994, "learning_rate": 0.0009895943406403465, "loss": 0.93651462, "num_input_tokens_seen": 40206464, "router_z_loss_mlp": 0.25244141, "routerloss_mlp": 0.0, "step": 484, "time_per_iteration": 2.728445053100586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01231643, "balance_loss_mlp": 1.20584655, "diversity_loss_mlp": 0.0, "epoch": 0.09330511735282801, "flos": 659404924416.0, "grad_norm": 0.11173906110031175, "language_loss": 0.85102737, "learning_rate": 0.0009895310174615338, "loss": 0.86334383, "num_input_tokens_seen": 40277744, "router_z_loss_mlp": 0.25805664, "routerloss_mlp": 0.0, "step": 485, "time_per_iteration": 2.809858560562134 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01674879, "balance_loss_mlp": 1.65122819, "diversity_loss_mlp": 0.0, "epoch": 0.09349749903809157, "flos": 1452845984256.0, "grad_norm": 0.0891862493938321, "language_loss": 0.75718516, "learning_rate": 0.0009894675042299251, "loss": 0.77393395, "num_input_tokens_seen": 40503664, "router_z_loss_mlp": 0.23632812, "routerloss_mlp": 0.0, "step": 486, "time_per_iteration": 4.675356388092041 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149968, "balance_loss_mlp": 1.1268059, "diversity_loss_mlp": 0.0, "epoch": 0.09368988072335514, "flos": 520870860288.0, "grad_norm": 0.12873710921953274, "language_loss": 0.89867461, "learning_rate": 0.0009894038009701782, "loss": 0.91017425, "num_input_tokens_seen": 40571376, "router_z_loss_mlp": 0.23168945, "routerloss_mlp": 0.0, "step": 487, "time_per_iteration": 2.646655797958374 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141338, "balance_loss_mlp": 1.11786556, "diversity_loss_mlp": 0.0, "epoch": 0.0938822624086187, "flos": 497751381504.0, "grad_norm": 0.11717214663903742, "language_loss": 0.89069557, "learning_rate": 0.0009893399077070253, "loss": 0.90210891, "num_input_tokens_seen": 40638096, "router_z_loss_mlp": 0.23474121, "routerloss_mlp": 0.0, "step": 488, "time_per_iteration": 2.578733444213867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00936332, "balance_loss_mlp": 1.59238243, "diversity_loss_mlp": 0.24211329, "epoch": 0.09407464409388226, "flos": 533202766848.0, "grad_norm": 0.03786592480343135, "language_loss": 0.88446009, "learning_rate": 0.0009892758244652718, "loss": 0.89382339, "num_input_tokens_seen": 40710992, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0190843, "step": 489, "time_per_iteration": 2.72853946685791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131365, "balance_loss_mlp": 1.10876274, "diversity_loss_mlp": 0.0, "epoch": 0.09426702577914582, "flos": 586006700544.0, "grad_norm": 0.09957245788293691, "language_loss": 0.92780352, "learning_rate": 0.0009892115512697968, "loss": 0.93911719, "num_input_tokens_seen": 40778896, "router_z_loss_mlp": 0.22583008, "routerloss_mlp": 0.0, "step": 490, "time_per_iteration": 2.6975181102752686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127367, "balance_loss_mlp": 1.10648203, "diversity_loss_mlp": 0.0, "epoch": 0.0944594074644094, "flos": 503357524992.0, "grad_norm": 0.09077239739165983, "language_loss": 0.95311546, "learning_rate": 0.0009891470881455537, "loss": 0.96438909, "num_input_tokens_seen": 40853376, "router_z_loss_mlp": 0.2088623, "routerloss_mlp": 0.0, "step": 491, "time_per_iteration": 2.674140214920044 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141034, "balance_loss_mlp": 1.12092364, "diversity_loss_mlp": 0.0, "epoch": 0.09465178914967295, "flos": 571021125120.0, "grad_norm": 0.08843271909801863, "language_loss": 0.91967297, "learning_rate": 0.0009890824351175692, "loss": 0.93108326, "num_input_tokens_seen": 40923776, "router_z_loss_mlp": 0.20092773, "routerloss_mlp": 0.0, "step": 492, "time_per_iteration": 2.689789295196533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148763, "balance_loss_mlp": 1.12847304, "diversity_loss_mlp": 0.0, "epoch": 0.09484417083493651, "flos": 549361916928.0, "grad_norm": 0.0818574716555875, "language_loss": 0.96715915, "learning_rate": 0.0009890175922109435, "loss": 0.97864676, "num_input_tokens_seen": 40996848, "router_z_loss_mlp": 0.20288086, "routerloss_mlp": 0.0, "step": 493, "time_per_iteration": 2.653787136077881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161837, "balance_loss_mlp": 1.14108253, "diversity_loss_mlp": 0.0, "epoch": 0.09503655252020007, "flos": 823894944768.0, "grad_norm": 0.10785532679009643, "language_loss": 0.94627249, "learning_rate": 0.0009889525594508513, "loss": 0.95789087, "num_input_tokens_seen": 41071280, "router_z_loss_mlp": 0.20751953, "routerloss_mlp": 0.0, "step": 494, "time_per_iteration": 3.013289213180542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168804, "balance_loss_mlp": 1.14887238, "diversity_loss_mlp": 0.0, "epoch": 0.09522893420546363, "flos": 404621153280.0, "grad_norm": 0.09313196509024183, "language_loss": 0.89226812, "learning_rate": 0.0009888873368625404, "loss": 0.90395617, "num_input_tokens_seen": 41136304, "router_z_loss_mlp": 0.19934082, "routerloss_mlp": 0.0, "step": 495, "time_per_iteration": 2.4990835189819336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01215397, "balance_loss_mlp": 1.19448745, "diversity_loss_mlp": 0.0, "epoch": 0.0954213158907272, "flos": 691016583168.0, "grad_norm": 0.11525575263217126, "language_loss": 0.92808712, "learning_rate": 0.0009888219244713326, "loss": 0.94024116, "num_input_tokens_seen": 41212384, "router_z_loss_mlp": 0.20922852, "routerloss_mlp": 0.0, "step": 496, "time_per_iteration": 2.828477382659912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01235818, "balance_loss_mlp": 1.2138716, "diversity_loss_mlp": 0.0, "epoch": 0.09561369757599077, "flos": 519005325312.0, "grad_norm": 0.13708349411569606, "language_loss": 0.92383498, "learning_rate": 0.0009887563223026229, "loss": 0.93619317, "num_input_tokens_seen": 41282528, "router_z_loss_mlp": 0.21948242, "routerloss_mlp": 0.0, "step": 497, "time_per_iteration": 2.6688501834869385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.03358766, "balance_loss_mlp": 3.33902526, "diversity_loss_mlp": 0.0, "epoch": 0.09580607926125433, "flos": 1385614812672.0, "grad_norm": 0.4973253845941573, "language_loss": 0.7906816, "learning_rate": 0.0009886905303818805, "loss": 0.82426929, "num_input_tokens_seen": 41512256, "router_z_loss_mlp": 0.19726562, "routerloss_mlp": 0.0, "step": 498, "time_per_iteration": 4.9225428104400635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0125204, "balance_loss_mlp": 1.22810328, "diversity_loss_mlp": 0.0, "epoch": 0.09599846094651789, "flos": 717436901376.0, "grad_norm": 0.09338533863845942, "language_loss": 0.9145627, "learning_rate": 0.0009886245487346482, "loss": 0.92708313, "num_input_tokens_seen": 41596816, "router_z_loss_mlp": 0.23925781, "routerloss_mlp": 0.0, "step": 499, "time_per_iteration": 3.0396392345428467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01273949, "balance_loss_mlp": 1.24874783, "diversity_loss_mlp": 0.0, "epoch": 0.09619084263178146, "flos": 386038130688.0, "grad_norm": 0.12406156723875504, "language_loss": 0.94657683, "learning_rate": 0.0009885583773865422, "loss": 0.95931631, "num_input_tokens_seen": 41658544, "router_z_loss_mlp": 0.2520752, "routerloss_mlp": 0.0, "step": 500, "time_per_iteration": 2.434283971786499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01319213, "balance_loss_mlp": 1.29096031, "diversity_loss_mlp": 0.0, "epoch": 0.09638322431704502, "flos": 534129292800.0, "grad_norm": 0.11518840252548597, "language_loss": 0.91528684, "learning_rate": 0.0009884920163632524, "loss": 0.92847896, "num_input_tokens_seen": 41730736, "router_z_loss_mlp": 0.2824707, "routerloss_mlp": 0.0, "step": 501, "time_per_iteration": 2.6888957023620605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0131255, "balance_loss_mlp": 1.28246212, "diversity_loss_mlp": 0.0, "epoch": 0.09657560600230858, "flos": 500671922688.0, "grad_norm": 0.12991803618191863, "language_loss": 0.93797207, "learning_rate": 0.000988425465690543, "loss": 0.95109755, "num_input_tokens_seen": 41797824, "router_z_loss_mlp": 0.30102539, "routerloss_mlp": 0.0, "step": 502, "time_per_iteration": 2.5672004222869873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01283439, "balance_loss_mlp": 1.25225365, "diversity_loss_mlp": 0.0, "epoch": 0.09676798768757214, "flos": 529261724160.0, "grad_norm": 0.11000587000012971, "language_loss": 0.91223967, "learning_rate": 0.0009883587253942505, "loss": 0.92507404, "num_input_tokens_seen": 41875520, "router_z_loss_mlp": 0.31152344, "routerloss_mlp": 0.0, "step": 503, "time_per_iteration": 2.7560157775878906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01273545, "balance_loss_mlp": 1.24281311, "diversity_loss_mlp": 0.0, "epoch": 0.09696036937283571, "flos": 463614534144.0, "grad_norm": 0.10509235815923167, "language_loss": 0.97371984, "learning_rate": 0.0009882917955002862, "loss": 0.9864552, "num_input_tokens_seen": 41942224, "router_z_loss_mlp": 0.30712891, "routerloss_mlp": 0.0, "step": 504, "time_per_iteration": 2.5183091163635254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01227481, "balance_loss_mlp": 1.1978929, "diversity_loss_mlp": 0.0, "epoch": 0.09715275105809927, "flos": 534974326272.0, "grad_norm": 0.11004475447178139, "language_loss": 0.90284961, "learning_rate": 0.0009882246760346343, "loss": 0.91512442, "num_input_tokens_seen": 42007552, "router_z_loss_mlp": 0.2956543, "routerloss_mlp": 0.0, "step": 505, "time_per_iteration": 2.6169376373291016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01215441, "balance_loss_mlp": 1.18637753, "diversity_loss_mlp": 0.0, "epoch": 0.09734513274336283, "flos": 454946886144.0, "grad_norm": 0.13294554223904492, "language_loss": 0.94025862, "learning_rate": 0.0009881573670233533, "loss": 0.95241302, "num_input_tokens_seen": 42071760, "router_z_loss_mlp": 0.29077148, "routerloss_mlp": 0.0, "step": 506, "time_per_iteration": 2.5373079776763916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.012064, "balance_loss_mlp": 1.17976809, "diversity_loss_mlp": 0.0, "epoch": 0.09753751442862639, "flos": 508805826048.0, "grad_norm": 0.07932421313758002, "language_loss": 0.89223576, "learning_rate": 0.0009880898684925747, "loss": 0.90429974, "num_input_tokens_seen": 42140688, "router_z_loss_mlp": 0.26660156, "routerloss_mlp": 0.0, "step": 507, "time_per_iteration": 2.661796808242798 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01206827, "balance_loss_mlp": 1.18070853, "diversity_loss_mlp": 0.0, "epoch": 0.09772989611388996, "flos": 484273064448.0, "grad_norm": 0.09132088261693337, "language_loss": 0.87935519, "learning_rate": 0.0009880221804685037, "loss": 0.89142346, "num_input_tokens_seen": 42208544, "router_z_loss_mlp": 0.26159668, "routerloss_mlp": 0.0, "step": 508, "time_per_iteration": 2.542513608932495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02552291, "balance_loss_mlp": 2.42869496, "diversity_loss_mlp": 0.0, "epoch": 0.09792227779915352, "flos": 1566106140672.0, "grad_norm": 0.1282373293100265, "language_loss": 0.79344422, "learning_rate": 0.000987954302977419, "loss": 0.8189671, "num_input_tokens_seen": 42426624, "router_z_loss_mlp": 1.234375, "routerloss_mlp": 0.0, "step": 509, "time_per_iteration": 4.707206964492798 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01280503, "balance_loss_mlp": 1.25399113, "diversity_loss_mlp": 0.0, "epoch": 0.09811465948441708, "flos": 587805424128.0, "grad_norm": 0.09929466646798928, "language_loss": 0.93586993, "learning_rate": 0.0009878862360456733, "loss": 0.94867498, "num_input_tokens_seen": 42494592, "router_z_loss_mlp": 0.265625, "routerloss_mlp": 0.0, "step": 510, "time_per_iteration": 2.6981284618377686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01284628, "balance_loss_mlp": 1.25883126, "diversity_loss_mlp": 0.0, "epoch": 0.09830704116968064, "flos": 613000410624.0, "grad_norm": 0.10250849932844218, "language_loss": 0.87516463, "learning_rate": 0.0009878179796996922, "loss": 0.88801086, "num_input_tokens_seen": 42564944, "router_z_loss_mlp": 0.25817871, "routerloss_mlp": 0.0, "step": 511, "time_per_iteration": 2.7541561126708984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01281708, "balance_loss_mlp": 1.25468373, "diversity_loss_mlp": 0.0, "epoch": 0.09849942285494422, "flos": 538808910336.0, "grad_norm": 0.10234956077068923, "language_loss": 0.90780497, "learning_rate": 0.0009877495339659754, "loss": 0.92062211, "num_input_tokens_seen": 42645616, "router_z_loss_mlp": 0.27038574, "routerloss_mlp": 0.0, "step": 512, "time_per_iteration": 2.7744665145874023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01278173, "balance_loss_mlp": 1.25241184, "diversity_loss_mlp": 0.0, "epoch": 0.09869180454020778, "flos": 620474660352.0, "grad_norm": 0.11291475079800635, "language_loss": 0.85683644, "learning_rate": 0.000987680898871096, "loss": 0.86961818, "num_input_tokens_seen": 42713632, "router_z_loss_mlp": 0.2578125, "routerloss_mlp": 0.0, "step": 513, "time_per_iteration": 2.8321592807769775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01289018, "balance_loss_mlp": 1.26217198, "diversity_loss_mlp": 0.0, "epoch": 0.09888418622547133, "flos": 811711342080.0, "grad_norm": 0.10190264212433507, "language_loss": 0.85800934, "learning_rate": 0.0009876120744417, "loss": 0.87089956, "num_input_tokens_seen": 42789088, "router_z_loss_mlp": 0.26867676, "routerloss_mlp": 0.0, "step": 514, "time_per_iteration": 2.945312023162842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01245022, "balance_loss_mlp": 1.2198211, "diversity_loss_mlp": 0.0, "epoch": 0.0990765679107349, "flos": 535809447936.0, "grad_norm": 0.09616865198011539, "language_loss": 0.94088352, "learning_rate": 0.0009875430607045078, "loss": 0.9533338, "num_input_tokens_seen": 42861168, "router_z_loss_mlp": 0.2520752, "routerloss_mlp": 0.0, "step": 515, "time_per_iteration": 2.656282663345337 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01214395, "balance_loss_mlp": 1.19058895, "diversity_loss_mlp": 0.0, "epoch": 0.09926894959599845, "flos": 587879576064.0, "grad_norm": 0.0895550710797692, "language_loss": 0.91242373, "learning_rate": 0.000987473857686313, "loss": 0.9245677, "num_input_tokens_seen": 42934112, "router_z_loss_mlp": 0.23791504, "routerloss_mlp": 0.0, "step": 516, "time_per_iteration": 2.7530250549316406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01218622, "balance_loss_mlp": 1.19458985, "diversity_loss_mlp": 0.0, "epoch": 0.09946133128126203, "flos": 641234506752.0, "grad_norm": 0.11626991588591096, "language_loss": 0.92559797, "learning_rate": 0.0009874044654139824, "loss": 0.93778414, "num_input_tokens_seen": 43005248, "router_z_loss_mlp": 0.24023438, "routerloss_mlp": 0.0, "step": 517, "time_per_iteration": 2.7673146724700928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188049, "balance_loss_mlp": 1.16410005, "diversity_loss_mlp": 0.0, "epoch": 0.09965371296652559, "flos": 465781446144.0, "grad_norm": 0.09260385447056875, "language_loss": 0.91065013, "learning_rate": 0.0009873348839144563, "loss": 0.92253065, "num_input_tokens_seen": 43070576, "router_z_loss_mlp": 0.23950195, "routerloss_mlp": 0.0, "step": 518, "time_per_iteration": 2.5385515689849854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162168, "balance_loss_mlp": 1.13979197, "diversity_loss_mlp": 0.0, "epoch": 0.09984609465178915, "flos": 483603499008.0, "grad_norm": 0.07604390633760301, "language_loss": 0.95252264, "learning_rate": 0.000987265113214749, "loss": 0.96414435, "num_input_tokens_seen": 43138048, "router_z_loss_mlp": 0.22375488, "routerloss_mlp": 0.0, "step": 519, "time_per_iteration": 2.556882619857788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171262, "balance_loss_mlp": 1.14849353, "diversity_loss_mlp": 0.0, "epoch": 0.1000384763370527, "flos": 569029681152.0, "grad_norm": 0.093032650642813, "language_loss": 0.94720447, "learning_rate": 0.0009871951533419476, "loss": 0.95891708, "num_input_tokens_seen": 43207600, "router_z_loss_mlp": 0.22753906, "routerloss_mlp": 0.0, "step": 520, "time_per_iteration": 2.724825143814087 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163285, "balance_loss_mlp": 1.14063525, "diversity_loss_mlp": 0.0, "epoch": 0.10023085802231628, "flos": 545796403200.0, "grad_norm": 0.07732484115861517, "language_loss": 0.87440532, "learning_rate": 0.0009871250043232132, "loss": 0.88603818, "num_input_tokens_seen": 43285104, "router_z_loss_mlp": 0.22631836, "routerloss_mlp": 0.0, "step": 521, "time_per_iteration": 2.756647825241089 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171709, "balance_loss_mlp": 1.14840364, "diversity_loss_mlp": 0.0, "epoch": 0.10042323970757984, "flos": 503454071808.0, "grad_norm": 0.08586449419627491, "language_loss": 0.8592059, "learning_rate": 0.0009870546661857797, "loss": 0.87092298, "num_input_tokens_seen": 43353312, "router_z_loss_mlp": 0.23291016, "routerloss_mlp": 0.0, "step": 522, "time_per_iteration": 2.611241340637207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188262, "balance_loss_mlp": 1.16447985, "diversity_loss_mlp": 0.0, "epoch": 0.1006156213928434, "flos": 770411533824.0, "grad_norm": 0.11121774977632432, "language_loss": 0.93899059, "learning_rate": 0.0009869841389569553, "loss": 0.9508732, "num_input_tokens_seen": 43427680, "router_z_loss_mlp": 0.2376709, "routerloss_mlp": 0.0, "step": 523, "time_per_iteration": 2.986001491546631 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00897074, "balance_loss_mlp": 1.51972795, "diversity_loss_mlp": 0.23477924, "epoch": 0.10080800307810696, "flos": 490030083072.0, "grad_norm": 0.04055297882665198, "language_loss": 0.88430732, "learning_rate": 0.0009869134226641206, "loss": 0.89327806, "num_input_tokens_seen": 43495200, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01982057, "step": 524, "time_per_iteration": 2.5944766998291016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213869, "balance_loss_mlp": 1.19106424, "diversity_loss_mlp": 0.0, "epoch": 0.10100038476337053, "flos": 454724430336.0, "grad_norm": 0.1040439940574723, "language_loss": 0.87633705, "learning_rate": 0.0009868425173347303, "loss": 0.88847572, "num_input_tokens_seen": 43566256, "router_z_loss_mlp": 0.22814941, "routerloss_mlp": 0.0, "step": 525, "time_per_iteration": 2.679245710372925 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01202393, "balance_loss_mlp": 1.17973125, "diversity_loss_mlp": 0.0, "epoch": 0.10119276644863409, "flos": 556438242816.0, "grad_norm": 0.10306076043273057, "language_loss": 0.95430547, "learning_rate": 0.0009867714229963125, "loss": 0.96632946, "num_input_tokens_seen": 43639696, "router_z_loss_mlp": 0.2265625, "routerloss_mlp": 0.0, "step": 526, "time_per_iteration": 2.6960504055023193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01194179, "balance_loss_mlp": 1.17121899, "diversity_loss_mlp": 0.0, "epoch": 0.10138514813389765, "flos": 516235659264.0, "grad_norm": 0.13221329860014494, "language_loss": 0.92439747, "learning_rate": 0.000986700139676468, "loss": 0.93633932, "num_input_tokens_seen": 43703872, "router_z_loss_mlp": 0.22937012, "routerloss_mlp": 0.0, "step": 527, "time_per_iteration": 2.5740442276000977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01226752, "balance_loss_mlp": 1.20331526, "diversity_loss_mlp": 0.0, "epoch": 0.10157752981916121, "flos": 500570606592.0, "grad_norm": 0.07480383753700154, "language_loss": 0.90178651, "learning_rate": 0.0009866286674028717, "loss": 0.91405398, "num_input_tokens_seen": 43774416, "router_z_loss_mlp": 0.23425293, "routerloss_mlp": 0.0, "step": 528, "time_per_iteration": 2.6214394569396973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00901033, "balance_loss_mlp": 1.53179681, "diversity_loss_mlp": 0.23385583, "epoch": 0.10176991150442478, "flos": 656773277184.0, "grad_norm": 0.042015219172821444, "language_loss": 0.87127066, "learning_rate": 0.0009865570062032717, "loss": 0.88028097, "num_input_tokens_seen": 43853376, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01820667, "step": 529, "time_per_iteration": 2.947612762451172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01243163, "balance_loss_mlp": 1.21885657, "diversity_loss_mlp": 0.0, "epoch": 0.10196229318968834, "flos": 573259617792.0, "grad_norm": 0.11620953964099495, "language_loss": 0.91896212, "learning_rate": 0.0009864851561054893, "loss": 0.93139374, "num_input_tokens_seen": 43929632, "router_z_loss_mlp": 0.24304199, "routerloss_mlp": 0.0, "step": 530, "time_per_iteration": 2.8097901344299316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01192516, "balance_loss_mlp": 1.16937733, "diversity_loss_mlp": 0.0, "epoch": 0.1021546748749519, "flos": 518207279616.0, "grad_norm": 0.0991735208834069, "language_loss": 0.90383148, "learning_rate": 0.0009864131171374191, "loss": 0.9157567, "num_input_tokens_seen": 44002144, "router_z_loss_mlp": 0.23132324, "routerloss_mlp": 0.0, "step": 531, "time_per_iteration": 2.6775832176208496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169363, "balance_loss_mlp": 1.14682031, "diversity_loss_mlp": 0.0, "epoch": 0.10234705656021546, "flos": 609766009344.0, "grad_norm": 0.08125371515716559, "language_loss": 0.90489674, "learning_rate": 0.0009863408893270292, "loss": 0.91659039, "num_input_tokens_seen": 44078272, "router_z_loss_mlp": 0.2253418, "routerloss_mlp": 0.0, "step": 532, "time_per_iteration": 2.7877254486083984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134136, "balance_loss_mlp": 1.1120224, "diversity_loss_mlp": 0.0, "epoch": 0.10253943824547904, "flos": 601760586240.0, "grad_norm": 0.11770570969906818, "language_loss": 0.85183895, "learning_rate": 0.0009862684727023605, "loss": 0.8631804, "num_input_tokens_seen": 44152304, "router_z_loss_mlp": 0.22131348, "routerloss_mlp": 0.0, "step": 533, "time_per_iteration": 2.717573642730713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128208, "balance_loss_mlp": 1.10571277, "diversity_loss_mlp": 0.0, "epoch": 0.1027318199307426, "flos": 662948043264.0, "grad_norm": 0.10673213842736717, "language_loss": 0.88664484, "learning_rate": 0.0009861958672915283, "loss": 0.89792687, "num_input_tokens_seen": 44226720, "router_z_loss_mlp": 0.22497559, "routerloss_mlp": 0.0, "step": 534, "time_per_iteration": 2.7880847454071045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111155, "balance_loss_mlp": 1.08948302, "diversity_loss_mlp": 0.0, "epoch": 0.10292420161600616, "flos": 683275461120.0, "grad_norm": 0.11915216532291298, "language_loss": 0.88834876, "learning_rate": 0.0009861230731227201, "loss": 0.89946032, "num_input_tokens_seen": 44303600, "router_z_loss_mlp": 0.21679688, "routerloss_mlp": 0.0, "step": 535, "time_per_iteration": 2.844203233718872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121725, "balance_loss_mlp": 1.10002935, "diversity_loss_mlp": 0.0, "epoch": 0.10311658330126972, "flos": 490287043584.0, "grad_norm": 0.11019657032079996, "language_loss": 0.90318179, "learning_rate": 0.0009860500902241973, "loss": 0.91439903, "num_input_tokens_seen": 44370960, "router_z_loss_mlp": 0.21716309, "routerloss_mlp": 0.0, "step": 536, "time_per_iteration": 2.5753133296966553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126411, "balance_loss_mlp": 1.10444033, "diversity_loss_mlp": 0.0, "epoch": 0.10330896498653329, "flos": 431743343616.0, "grad_norm": 0.13353850851854182, "language_loss": 0.95278764, "learning_rate": 0.0009859769186242942, "loss": 0.96405172, "num_input_tokens_seen": 44435584, "router_z_loss_mlp": 0.21984863, "routerloss_mlp": 0.0, "step": 537, "time_per_iteration": 2.544611930847168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00894726, "balance_loss_mlp": 1.52693653, "diversity_loss_mlp": 0.22699235, "epoch": 0.10350134667179685, "flos": 549591713280.0, "grad_norm": 0.04205207536563703, "language_loss": 0.88558614, "learning_rate": 0.0009859035583514187, "loss": 0.8945334, "num_input_tokens_seen": 44505456, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01776124, "step": 538, "time_per_iteration": 2.647594451904297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01257859, "balance_loss_mlp": 1.23475599, "diversity_loss_mlp": 0.0, "epoch": 0.10369372835706041, "flos": 640626610176.0, "grad_norm": 0.11200334451020948, "language_loss": 0.89448857, "learning_rate": 0.0009858300094340517, "loss": 0.90706718, "num_input_tokens_seen": 44580208, "router_z_loss_mlp": 0.23071289, "routerloss_mlp": 0.0, "step": 539, "time_per_iteration": 2.7679364681243896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01291272, "balance_loss_mlp": 1.26785898, "diversity_loss_mlp": 0.0, "epoch": 0.10388611004232397, "flos": 521752969728.0, "grad_norm": 0.17493624211104222, "language_loss": 0.84562349, "learning_rate": 0.0009857562719007473, "loss": 0.85853624, "num_input_tokens_seen": 44646576, "router_z_loss_mlp": 0.23388672, "routerloss_mlp": 0.0, "step": 540, "time_per_iteration": 2.6256375312805176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01267144, "balance_loss_mlp": 1.24492311, "diversity_loss_mlp": 0.0, "epoch": 0.10407849172758753, "flos": 702436644864.0, "grad_norm": 0.14114133743563548, "language_loss": 0.86615884, "learning_rate": 0.0009856823457801331, "loss": 0.87883031, "num_input_tokens_seen": 44726752, "router_z_loss_mlp": 0.22229004, "routerloss_mlp": 0.0, "step": 541, "time_per_iteration": 2.8773691654205322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01254714, "balance_loss_mlp": 1.23256469, "diversity_loss_mlp": 0.0, "epoch": 0.1042708734128511, "flos": 502910415360.0, "grad_norm": 0.08733197639022866, "language_loss": 0.93604994, "learning_rate": 0.00098560823110091, "loss": 0.94859707, "num_input_tokens_seen": 44795824, "router_z_loss_mlp": 0.22167969, "routerloss_mlp": 0.0, "step": 542, "time_per_iteration": 2.6173057556152344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01206738, "balance_loss_mlp": 1.18436217, "diversity_loss_mlp": 0.0, "epoch": 0.10446325509811466, "flos": 485592371712.0, "grad_norm": 0.14252191795618116, "language_loss": 0.94814467, "learning_rate": 0.000985533927891851, "loss": 0.96021199, "num_input_tokens_seen": 44868496, "router_z_loss_mlp": 0.22387695, "routerloss_mlp": 0.0, "step": 543, "time_per_iteration": 2.682035207748413 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00924177, "balance_loss_mlp": 1.58877563, "diversity_loss_mlp": 0.22542018, "epoch": 0.10465563678337822, "flos": 568634328576.0, "grad_norm": 0.04171093567104517, "language_loss": 0.92462713, "learning_rate": 0.0009854594361818044, "loss": 0.93386889, "num_input_tokens_seen": 44939888, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01707876, "step": 544, "time_per_iteration": 2.771606922149658 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134006, "balance_loss_mlp": 1.11126077, "diversity_loss_mlp": 0.0, "epoch": 0.10484801846864178, "flos": 626093286912.0, "grad_norm": 0.16622789723447462, "language_loss": 0.91736549, "learning_rate": 0.0009853847559996897, "loss": 0.92870551, "num_input_tokens_seen": 45012720, "router_z_loss_mlp": 0.22729492, "routerloss_mlp": 0.0, "step": 545, "time_per_iteration": 2.714980363845825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131307, "balance_loss_mlp": 1.10896707, "diversity_loss_mlp": 0.0, "epoch": 0.10504040015390535, "flos": 743412681216.0, "grad_norm": 0.13863422454282084, "language_loss": 0.90834534, "learning_rate": 0.0009853098873745, "loss": 0.91965836, "num_input_tokens_seen": 45093744, "router_z_loss_mlp": 0.22351074, "routerloss_mlp": 0.0, "step": 546, "time_per_iteration": 2.98349928855896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127677, "balance_loss_mlp": 1.10500383, "diversity_loss_mlp": 0.0, "epoch": 0.10523278183916891, "flos": 586673694720.0, "grad_norm": 0.15888834478547278, "language_loss": 0.90073705, "learning_rate": 0.0009852348303353027, "loss": 0.91201389, "num_input_tokens_seen": 45172784, "router_z_loss_mlp": 0.22668457, "routerloss_mlp": 0.0, "step": 547, "time_per_iteration": 2.782012701034546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148987, "balance_loss_mlp": 1.12613487, "diversity_loss_mlp": 0.0, "epoch": 0.10542516352443247, "flos": 869644574208.0, "grad_norm": 0.10179846154330349, "language_loss": 0.82990968, "learning_rate": 0.000985159584911237, "loss": 0.84139955, "num_input_tokens_seen": 45255600, "router_z_loss_mlp": 0.22839355, "routerloss_mlp": 0.0, "step": 548, "time_per_iteration": 3.102688789367676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01216658, "balance_loss_mlp": 1.19307828, "diversity_loss_mlp": 0.0, "epoch": 0.10561754520969603, "flos": 505428263424.0, "grad_norm": 0.12466178148261096, "language_loss": 0.89916652, "learning_rate": 0.0009850841511315162, "loss": 0.91133308, "num_input_tokens_seen": 45325072, "router_z_loss_mlp": 0.2355957, "routerloss_mlp": 0.0, "step": 549, "time_per_iteration": 2.61226749420166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01241093, "balance_loss_mlp": 1.21708441, "diversity_loss_mlp": 0.0, "epoch": 0.1058099268949596, "flos": 559981361664.0, "grad_norm": 0.11901003741868514, "language_loss": 0.90615034, "learning_rate": 0.0009850085290254256, "loss": 0.91856128, "num_input_tokens_seen": 45401440, "router_z_loss_mlp": 0.23986816, "routerloss_mlp": 0.0, "step": 550, "time_per_iteration": 2.7958199977874756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00914838, "balance_loss_mlp": 1.5724771, "diversity_loss_mlp": 0.22113116, "epoch": 0.10600230858022316, "flos": 562049528832.0, "grad_norm": 0.03122458898086593, "language_loss": 0.87977409, "learning_rate": 0.0009849327186223246, "loss": 0.88892245, "num_input_tokens_seen": 45479264, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0180343, "step": 551, "time_per_iteration": 2.799394130706787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01242815, "balance_loss_mlp": 1.21818638, "diversity_loss_mlp": 0.0, "epoch": 0.10619469026548672, "flos": 494326831104.0, "grad_norm": 0.10957849833176474, "language_loss": 0.95181417, "learning_rate": 0.000984856719951646, "loss": 0.96424234, "num_input_tokens_seen": 45547328, "router_z_loss_mlp": 0.24609375, "routerloss_mlp": 0.0, "step": 552, "time_per_iteration": 2.559286117553711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0121032, "balance_loss_mlp": 1.18546462, "diversity_loss_mlp": 0.0, "epoch": 0.10638707195075028, "flos": 676166828544.0, "grad_norm": 0.09349197696587547, "language_loss": 0.91760498, "learning_rate": 0.0009847805330428943, "loss": 0.92970818, "num_input_tokens_seen": 45631152, "router_z_loss_mlp": 0.24865723, "routerloss_mlp": 0.0, "step": 553, "time_per_iteration": 2.906571388244629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00875983, "balance_loss_mlp": 1.49139261, "diversity_loss_mlp": 0.22127438, "epoch": 0.10657945363601386, "flos": 488055891456.0, "grad_norm": 0.05457604420902532, "language_loss": 0.93558431, "learning_rate": 0.0009847041579256481, "loss": 0.94434416, "num_input_tokens_seen": 45698208, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01964992, "step": 554, "time_per_iteration": 2.6159372329711914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01202664, "balance_loss_mlp": 1.17859542, "diversity_loss_mlp": 0.0, "epoch": 0.10677183532127742, "flos": 482958526464.0, "grad_norm": 0.08395889420783041, "language_loss": 0.94042808, "learning_rate": 0.0009846275946295592, "loss": 0.95245475, "num_input_tokens_seen": 45766640, "router_z_loss_mlp": 0.24072266, "routerloss_mlp": 0.0, "step": 555, "time_per_iteration": 2.592341184616089 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01182493, "balance_loss_mlp": 1.15904498, "diversity_loss_mlp": 0.0, "epoch": 0.10696421700654098, "flos": 656249444352.0, "grad_norm": 0.08262845202589308, "language_loss": 0.8740595, "learning_rate": 0.0009845508431843518, "loss": 0.8858844, "num_input_tokens_seen": 45851408, "router_z_loss_mlp": 0.23425293, "routerloss_mlp": 0.0, "step": 556, "time_per_iteration": 3.0123813152313232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177615, "balance_loss_mlp": 1.15481031, "diversity_loss_mlp": 0.0, "epoch": 0.10715659869180454, "flos": 567744878592.0, "grad_norm": 0.07593810566908125, "language_loss": 0.88148719, "learning_rate": 0.0009844739036198233, "loss": 0.8932634, "num_input_tokens_seen": 45919824, "router_z_loss_mlp": 0.22814941, "routerloss_mlp": 0.0, "step": 557, "time_per_iteration": 2.6356143951416016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01184514, "balance_loss_mlp": 1.16157842, "diversity_loss_mlp": 0.0, "epoch": 0.10734898037706811, "flos": 540694268928.0, "grad_norm": 0.09177793780956148, "language_loss": 0.94916999, "learning_rate": 0.0009843967759658448, "loss": 0.96101511, "num_input_tokens_seen": 45991024, "router_z_loss_mlp": 0.22912598, "routerloss_mlp": 0.0, "step": 558, "time_per_iteration": 2.6546378135681152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02293865, "balance_loss_mlp": 2.17026901, "diversity_loss_mlp": 0.0, "epoch": 0.10754136206233167, "flos": 1476640171008.0, "grad_norm": 0.09925677209713644, "language_loss": 0.72767758, "learning_rate": 0.0009843194602523592, "loss": 0.75061619, "num_input_tokens_seen": 46212736, "router_z_loss_mlp": 1.234375, "routerloss_mlp": 0.0, "step": 559, "time_per_iteration": 4.829499244689941 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01207667, "balance_loss_mlp": 1.18555331, "diversity_loss_mlp": 0.0, "epoch": 0.10773374374759523, "flos": 512405844480.0, "grad_norm": 0.1031420062274817, "language_loss": 0.9552027, "learning_rate": 0.000984241956509384, "loss": 0.96727937, "num_input_tokens_seen": 46283920, "router_z_loss_mlp": 0.22131348, "routerloss_mlp": 0.0, "step": 560, "time_per_iteration": 2.65759539604187 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204016, "balance_loss_mlp": 1.18220043, "diversity_loss_mlp": 0.0, "epoch": 0.10792612543285879, "flos": 496503654912.0, "grad_norm": 0.08944048757536185, "language_loss": 0.90505213, "learning_rate": 0.0009841642647670078, "loss": 0.91709226, "num_input_tokens_seen": 46349664, "router_z_loss_mlp": 0.21826172, "routerloss_mlp": 0.0, "step": 561, "time_per_iteration": 2.591806173324585 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01194467, "balance_loss_mlp": 1.17308092, "diversity_loss_mlp": 0.0, "epoch": 0.10811850711812235, "flos": 735471498240.0, "grad_norm": 0.08297191380839272, "language_loss": 0.85483265, "learning_rate": 0.0009840863850553944, "loss": 0.8667773, "num_input_tokens_seen": 46432688, "router_z_loss_mlp": 0.21398926, "routerloss_mlp": 0.0, "step": 562, "time_per_iteration": 2.963149309158325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01179499, "balance_loss_mlp": 1.15856552, "diversity_loss_mlp": 0.0, "epoch": 0.10831088880338592, "flos": 611540140032.0, "grad_norm": 0.18759249419324772, "language_loss": 0.9088884, "learning_rate": 0.0009840083174047782, "loss": 0.92068338, "num_input_tokens_seen": 46507216, "router_z_loss_mlp": 0.20947266, "routerloss_mlp": 0.0, "step": 563, "time_per_iteration": 2.71415114402771 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169496, "balance_loss_mlp": 1.14940953, "diversity_loss_mlp": 0.0, "epoch": 0.10850327048864948, "flos": 556317103104.0, "grad_norm": 0.08351477183844232, "language_loss": 0.86295354, "learning_rate": 0.0009839300618454685, "loss": 0.87464857, "num_input_tokens_seen": 46590464, "router_z_loss_mlp": 0.20080566, "routerloss_mlp": 0.0, "step": 564, "time_per_iteration": 2.8288042545318604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163807, "balance_loss_mlp": 1.14280224, "diversity_loss_mlp": 0.0, "epoch": 0.10869565217391304, "flos": 603208373760.0, "grad_norm": 0.0761185875884483, "language_loss": 0.9141686, "learning_rate": 0.0009838516184078466, "loss": 0.92580664, "num_input_tokens_seen": 46666240, "router_z_loss_mlp": 0.21020508, "routerloss_mlp": 0.0, "step": 565, "time_per_iteration": 2.8194022178649902 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177734, "balance_loss_mlp": 1.15682447, "diversity_loss_mlp": 0.0, "epoch": 0.1088880338591766, "flos": 526178198016.0, "grad_norm": 0.14122321260962364, "language_loss": 0.88377023, "learning_rate": 0.0009837729871223669, "loss": 0.89554763, "num_input_tokens_seen": 46734288, "router_z_loss_mlp": 0.20922852, "routerloss_mlp": 0.0, "step": 566, "time_per_iteration": 2.6096079349517822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01194985, "balance_loss_mlp": 1.17372978, "diversity_loss_mlp": 0.0, "epoch": 0.10908041554444017, "flos": 620272028160.0, "grad_norm": 0.1066586812750682, "language_loss": 0.88896918, "learning_rate": 0.0009836941680195568, "loss": 0.90091902, "num_input_tokens_seen": 46809920, "router_z_loss_mlp": 0.21264648, "routerloss_mlp": 0.0, "step": 567, "time_per_iteration": 2.779846429824829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01210465, "balance_loss_mlp": 1.18900692, "diversity_loss_mlp": 0.0, "epoch": 0.10927279722970373, "flos": 898125719040.0, "grad_norm": 0.09744135285550241, "language_loss": 0.84777021, "learning_rate": 0.0009836151611300166, "loss": 0.85987484, "num_input_tokens_seen": 46889984, "router_z_loss_mlp": 0.21472168, "routerloss_mlp": 0.0, "step": 568, "time_per_iteration": 3.2130274772644043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01210546, "balance_loss_mlp": 1.18979168, "diversity_loss_mlp": 0.0, "epoch": 0.10946517891496729, "flos": 528666310656.0, "grad_norm": 0.0877787159655237, "language_loss": 0.95202124, "learning_rate": 0.0009835359664844194, "loss": 0.96412671, "num_input_tokens_seen": 46959536, "router_z_loss_mlp": 0.2076416, "routerloss_mlp": 0.0, "step": 569, "time_per_iteration": 2.614626407623291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02163392, "balance_loss_mlp": 2.12848806, "diversity_loss_mlp": 0.0, "epoch": 0.10965756060023085, "flos": 1560751815168.0, "grad_norm": 0.098326155744124, "language_loss": 0.81036806, "learning_rate": 0.0009834565841135114, "loss": 0.83200204, "num_input_tokens_seen": 47196960, "router_z_loss_mlp": 0.34960938, "routerloss_mlp": 0.0, "step": 570, "time_per_iteration": 4.910563230514526 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188679, "balance_loss_mlp": 1.16738796, "diversity_loss_mlp": 0.0, "epoch": 0.10984994228549443, "flos": 513075409920.0, "grad_norm": 0.10673198509513786, "language_loss": 0.92503107, "learning_rate": 0.0009833770140481118, "loss": 0.93691778, "num_input_tokens_seen": 47266560, "router_z_loss_mlp": 0.21313477, "routerloss_mlp": 0.0, "step": 571, "time_per_iteration": 2.6361794471740723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167522, "balance_loss_mlp": 1.14587367, "diversity_loss_mlp": 0.0, "epoch": 0.11004232397075799, "flos": 954705139200.0, "grad_norm": 0.06757736028097705, "language_loss": 0.82720339, "learning_rate": 0.000983297256319112, "loss": 0.83887863, "num_input_tokens_seen": 47348512, "router_z_loss_mlp": 0.21655273, "routerloss_mlp": 0.0, "step": 572, "time_per_iteration": 3.2420709133148193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148716, "balance_loss_mlp": 1.12606621, "diversity_loss_mlp": 0.0, "epoch": 0.11023470565602154, "flos": 488181800448.0, "grad_norm": 0.09218112459591986, "language_loss": 0.87054348, "learning_rate": 0.000983217310957477, "loss": 0.88203067, "num_input_tokens_seen": 47425392, "router_z_loss_mlp": 0.2265625, "routerloss_mlp": 0.0, "step": 573, "time_per_iteration": 2.7485547065734863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139139, "balance_loss_mlp": 1.11725259, "diversity_loss_mlp": 0.0, "epoch": 0.1104270873412851, "flos": 655814817792.0, "grad_norm": 0.08282639029669561, "language_loss": 0.90421212, "learning_rate": 0.000983137177994244, "loss": 0.91560352, "num_input_tokens_seen": 47502336, "router_z_loss_mlp": 0.21899414, "routerloss_mlp": 0.0, "step": 574, "time_per_iteration": 2.8651185035705566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142425, "balance_loss_mlp": 1.11990607, "diversity_loss_mlp": 0.0, "epoch": 0.11061946902654868, "flos": 723426287616.0, "grad_norm": 0.08655490231030577, "language_loss": 0.8561765, "learning_rate": 0.0009830568574605235, "loss": 0.8676008, "num_input_tokens_seen": 47583552, "router_z_loss_mlp": 0.22521973, "routerloss_mlp": 0.0, "step": 575, "time_per_iteration": 2.942331075668335 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162411, "balance_loss_mlp": 1.13946342, "diversity_loss_mlp": 0.0, "epoch": 0.11081185071181224, "flos": 835463310336.0, "grad_norm": 0.08792859421485215, "language_loss": 0.88113999, "learning_rate": 0.0009829763493874992, "loss": 0.89276409, "num_input_tokens_seen": 47663440, "router_z_loss_mlp": 0.22912598, "routerloss_mlp": 0.0, "step": 576, "time_per_iteration": 3.0282514095306396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173849, "balance_loss_mlp": 1.15098429, "diversity_loss_mlp": 0.0, "epoch": 0.1110042323970758, "flos": 609076620288.0, "grad_norm": 0.10676499351314739, "language_loss": 0.9303807, "learning_rate": 0.0009828956538064264, "loss": 0.94211912, "num_input_tokens_seen": 47741920, "router_z_loss_mlp": 0.2286377, "routerloss_mlp": 0.0, "step": 577, "time_per_iteration": 2.7946369647979736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173248, "balance_loss_mlp": 1.1503005, "diversity_loss_mlp": 0.0, "epoch": 0.11119661408233936, "flos": 595922075136.0, "grad_norm": 0.11074471638842859, "language_loss": 0.91223717, "learning_rate": 0.0009828147707486344, "loss": 0.92396963, "num_input_tokens_seen": 47815136, "router_z_loss_mlp": 0.22937012, "routerloss_mlp": 0.0, "step": 578, "time_per_iteration": 2.731588125228882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115684, "balance_loss_mlp": 1.13424993, "diversity_loss_mlp": 0.0, "epoch": 0.11138899576760293, "flos": 555835488768.0, "grad_norm": 0.09317476454713723, "language_loss": 0.86116958, "learning_rate": 0.0009827337002455245, "loss": 0.87273794, "num_input_tokens_seen": 47881360, "router_z_loss_mlp": 0.22583008, "routerloss_mlp": 0.0, "step": 579, "time_per_iteration": 2.639047145843506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134437, "balance_loss_mlp": 1.11184728, "diversity_loss_mlp": 0.0, "epoch": 0.11158137745286649, "flos": 689746461696.0, "grad_norm": 0.07918824025832125, "language_loss": 0.88299757, "learning_rate": 0.0009826524423285712, "loss": 0.89434195, "num_input_tokens_seen": 47962720, "router_z_loss_mlp": 0.22595215, "routerloss_mlp": 0.0, "step": 580, "time_per_iteration": 2.911012649536133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114105, "balance_loss_mlp": 1.11881745, "diversity_loss_mlp": 0.0, "epoch": 0.11177375913813005, "flos": 763011436032.0, "grad_norm": 0.10469703454021252, "language_loss": 0.89618349, "learning_rate": 0.0009825709970293218, "loss": 0.90759397, "num_input_tokens_seen": 48035472, "router_z_loss_mlp": 0.22229004, "routerloss_mlp": 0.0, "step": 581, "time_per_iteration": 2.8837828636169434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135626, "balance_loss_mlp": 1.11433506, "diversity_loss_mlp": 0.0, "epoch": 0.11196614082339361, "flos": 806574329856.0, "grad_norm": 0.1022616119694228, "language_loss": 0.95317924, "learning_rate": 0.0009824893643793956, "loss": 0.96453559, "num_input_tokens_seen": 48116944, "router_z_loss_mlp": 0.21289062, "routerloss_mlp": 0.0, "step": 582, "time_per_iteration": 3.0962114334106445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00948798, "balance_loss_mlp": 1.63779283, "diversity_loss_mlp": 0.22248407, "epoch": 0.11215852250865718, "flos": 558624978432.0, "grad_norm": 0.04350556393742171, "language_loss": 0.88843536, "learning_rate": 0.0009824075444104857, "loss": 0.89792335, "num_input_tokens_seen": 48187808, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01865991, "step": 583, "time_per_iteration": 2.719085454940796 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157517, "balance_loss_mlp": 1.13638163, "diversity_loss_mlp": 0.0, "epoch": 0.11235090419392074, "flos": 513572078592.0, "grad_norm": 0.10740950198198211, "language_loss": 0.93831933, "learning_rate": 0.000982325537154357, "loss": 0.94989443, "num_input_tokens_seen": 48254464, "router_z_loss_mlp": 0.21154785, "routerloss_mlp": 0.0, "step": 584, "time_per_iteration": 2.597120523452759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117352, "balance_loss_mlp": 1.15234792, "diversity_loss_mlp": 0.0, "epoch": 0.1125432858791843, "flos": 491453277696.0, "grad_norm": 0.12322952105084124, "language_loss": 0.94442445, "learning_rate": 0.0009822433426428484, "loss": 0.95615965, "num_input_tokens_seen": 48318784, "router_z_loss_mlp": 0.21179199, "routerloss_mlp": 0.0, "step": 585, "time_per_iteration": 2.571805238723755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01238103, "balance_loss_mlp": 1.2166214, "diversity_loss_mlp": 0.0, "epoch": 0.11273566756444786, "flos": 510725689344.0, "grad_norm": 0.08678287386034968, "language_loss": 0.87089044, "learning_rate": 0.0009821609609078697, "loss": 0.88327146, "num_input_tokens_seen": 48389248, "router_z_loss_mlp": 0.21484375, "routerloss_mlp": 0.0, "step": 586, "time_per_iteration": 2.586289405822754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01320429, "balance_loss_mlp": 1.29861343, "diversity_loss_mlp": 0.0, "epoch": 0.11292804924971142, "flos": 622446280704.0, "grad_norm": 0.09324667942342675, "language_loss": 0.89581811, "learning_rate": 0.0009820783919814045, "loss": 0.90902239, "num_input_tokens_seen": 48463312, "router_z_loss_mlp": 0.21826172, "routerloss_mlp": 0.0, "step": 587, "time_per_iteration": 2.804417848587036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01397697, "balance_loss_mlp": 1.37499988, "diversity_loss_mlp": 0.0, "epoch": 0.113120430934975, "flos": 478056453120.0, "grad_norm": 0.11766834316785481, "language_loss": 0.82825267, "learning_rate": 0.0009819956358955095, "loss": 0.8422296, "num_input_tokens_seen": 48531856, "router_z_loss_mlp": 0.22705078, "routerloss_mlp": 0.0, "step": 588, "time_per_iteration": 2.5654590129852295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01433511, "balance_loss_mlp": 1.41009879, "diversity_loss_mlp": 0.0, "epoch": 0.11331281262023855, "flos": 467039084544.0, "grad_norm": 0.13254981657968556, "language_loss": 0.84316242, "learning_rate": 0.0009819126926823127, "loss": 0.85749757, "num_input_tokens_seen": 48596640, "router_z_loss_mlp": 0.23413086, "routerloss_mlp": 0.0, "step": 589, "time_per_iteration": 2.5090954303741455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01369151, "balance_loss_mlp": 1.34720445, "diversity_loss_mlp": 0.0, "epoch": 0.11350519430550211, "flos": 650453151744.0, "grad_norm": 0.12923638752993147, "language_loss": 0.87131608, "learning_rate": 0.000981829562374016, "loss": 0.88500756, "num_input_tokens_seen": 48669648, "router_z_loss_mlp": 0.21948242, "routerloss_mlp": 0.0, "step": 590, "time_per_iteration": 2.7904558181762695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01263432, "balance_loss_mlp": 1.24309444, "diversity_loss_mlp": 0.0, "epoch": 0.11369757599076567, "flos": 557809680384.0, "grad_norm": 0.0979331207375339, "language_loss": 0.97635686, "learning_rate": 0.0009817462450028933, "loss": 0.98899126, "num_input_tokens_seen": 48737392, "router_z_loss_mlp": 0.20336914, "routerloss_mlp": 0.0, "step": 591, "time_per_iteration": 2.6596498489379883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186211, "balance_loss_mlp": 1.16698265, "diversity_loss_mlp": 0.0, "epoch": 0.11388995767602925, "flos": 571080222720.0, "grad_norm": 0.0791908179615389, "language_loss": 0.85476398, "learning_rate": 0.0009816627406012916, "loss": 0.86662614, "num_input_tokens_seen": 48817136, "router_z_loss_mlp": 0.1920166, "routerloss_mlp": 0.0, "step": 592, "time_per_iteration": 2.795384168624878 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143018, "balance_loss_mlp": 1.12423062, "diversity_loss_mlp": 0.0, "epoch": 0.1140823393612928, "flos": 740403307008.0, "grad_norm": 0.14133504737490046, "language_loss": 0.85158926, "learning_rate": 0.0009815790492016295, "loss": 0.86301947, "num_input_tokens_seen": 48895808, "router_z_loss_mlp": 0.18774414, "routerloss_mlp": 0.0, "step": 593, "time_per_iteration": 2.968202829360962 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113857, "balance_loss_mlp": 1.11954474, "diversity_loss_mlp": 0.0, "epoch": 0.11427472104655637, "flos": 699004753920.0, "grad_norm": 0.10990083394980393, "language_loss": 0.87156999, "learning_rate": 0.0009814951708363993, "loss": 0.88295579, "num_input_tokens_seen": 48967456, "router_z_loss_mlp": 0.19006348, "routerloss_mlp": 0.0, "step": 594, "time_per_iteration": 2.8341050148010254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01993613, "balance_loss_mlp": 1.96176016, "diversity_loss_mlp": 0.0, "epoch": 0.11446710273181993, "flos": 1477178684928.0, "grad_norm": 0.10325359814292956, "language_loss": 0.77990985, "learning_rate": 0.0009814111055381654, "loss": 0.79984605, "num_input_tokens_seen": 49193152, "router_z_loss_mlp": 0.31835938, "routerloss_mlp": 0.0, "step": 595, "time_per_iteration": 4.746119976043701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113071, "balance_loss_mlp": 1.11163688, "diversity_loss_mlp": 0.0, "epoch": 0.1146594844170835, "flos": 494895080448.0, "grad_norm": 0.1448933947746474, "language_loss": 0.89056683, "learning_rate": 0.0009813268533395648, "loss": 0.90187395, "num_input_tokens_seen": 49260960, "router_z_loss_mlp": 0.19067383, "routerloss_mlp": 0.0, "step": 596, "time_per_iteration": 2.592421054840088 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151969, "balance_loss_mlp": 1.13301492, "diversity_loss_mlp": 0.0, "epoch": 0.11485186610234706, "flos": 474834534912.0, "grad_norm": 0.12455054099529249, "language_loss": 0.8755219, "learning_rate": 0.0009812424142733073, "loss": 0.88704157, "num_input_tokens_seen": 49327616, "router_z_loss_mlp": 0.18933105, "routerloss_mlp": 0.0, "step": 597, "time_per_iteration": 2.549654483795166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158973, "balance_loss_mlp": 1.13961387, "diversity_loss_mlp": 0.0, "epoch": 0.11504424778761062, "flos": 731209254912.0, "grad_norm": 0.1533400924271749, "language_loss": 0.86129421, "learning_rate": 0.000981157788372175, "loss": 0.87288398, "num_input_tokens_seen": 49412864, "router_z_loss_mlp": 0.19348145, "routerloss_mlp": 0.0, "step": 598, "time_per_iteration": 3.029372453689575 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181573, "balance_loss_mlp": 1.16308403, "diversity_loss_mlp": 0.0, "epoch": 0.11523662947287418, "flos": 545823567360.0, "grad_norm": 0.08122879346901381, "language_loss": 0.89185023, "learning_rate": 0.0009810729756690223, "loss": 0.90366596, "num_input_tokens_seen": 49483584, "router_z_loss_mlp": 0.18481445, "routerloss_mlp": 0.0, "step": 599, "time_per_iteration": 2.72200608253479 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01225343, "balance_loss_mlp": 1.20584035, "diversity_loss_mlp": 0.0, "epoch": 0.11542901115813775, "flos": 775066558464.0, "grad_norm": 0.09322481346022114, "language_loss": 0.91937912, "learning_rate": 0.0009809879761967766, "loss": 0.93163252, "num_input_tokens_seen": 49563568, "router_z_loss_mlp": 0.19482422, "routerloss_mlp": 0.0, "step": 600, "time_per_iteration": 2.9454104900360107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01240049, "balance_loss_mlp": 1.22046316, "diversity_loss_mlp": 0.0, "epoch": 0.11562139284340131, "flos": 730910449152.0, "grad_norm": 0.11235514763344263, "language_loss": 0.86727029, "learning_rate": 0.0009809027899884378, "loss": 0.87967086, "num_input_tokens_seen": 49640800, "router_z_loss_mlp": 0.19580078, "routerloss_mlp": 0.0, "step": 601, "time_per_iteration": 2.888047218322754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01288764, "balance_loss_mlp": 1.26829576, "diversity_loss_mlp": 0.0, "epoch": 0.11581377452866487, "flos": 535878457344.0, "grad_norm": 0.07021797329248278, "language_loss": 0.88593882, "learning_rate": 0.0009808174170770779, "loss": 0.89882648, "num_input_tokens_seen": 49721872, "router_z_loss_mlp": 0.20458984, "routerloss_mlp": 0.0, "step": 602, "time_per_iteration": 2.8045670986175537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02144093, "balance_loss_mlp": 2.11128712, "diversity_loss_mlp": 0.0, "epoch": 0.11600615621392843, "flos": 1555814863872.0, "grad_norm": 0.1124732092134732, "language_loss": 0.84898245, "learning_rate": 0.0009807318574958418, "loss": 0.87042338, "num_input_tokens_seen": 49951472, "router_z_loss_mlp": 0.328125, "routerloss_mlp": 0.0, "step": 603, "time_per_iteration": 4.899731397628784 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01341078, "balance_loss_mlp": 1.32069361, "diversity_loss_mlp": 0.0, "epoch": 0.116198537899192, "flos": 537435274752.0, "grad_norm": 0.10202627615666406, "language_loss": 0.93765342, "learning_rate": 0.0009806461112779462, "loss": 0.95106417, "num_input_tokens_seen": 50021136, "router_z_loss_mlp": 0.20385742, "routerloss_mlp": 0.0, "step": 604, "time_per_iteration": 2.6618311405181885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01291209, "balance_loss_mlp": 1.27080083, "diversity_loss_mlp": 0.0, "epoch": 0.11639091958445556, "flos": 454203168768.0, "grad_norm": 0.13219567018011513, "language_loss": 0.87928259, "learning_rate": 0.0009805601784566814, "loss": 0.89219463, "num_input_tokens_seen": 50083888, "router_z_loss_mlp": 0.20397949, "routerloss_mlp": 0.0, "step": 605, "time_per_iteration": 2.4783012866973877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01229751, "balance_loss_mlp": 1.20996237, "diversity_loss_mlp": 0.0, "epoch": 0.11658330126971912, "flos": 555081859584.0, "grad_norm": 0.07794567116482086, "language_loss": 0.95705628, "learning_rate": 0.0009804740590654089, "loss": 0.9693538, "num_input_tokens_seen": 50151744, "router_z_loss_mlp": 0.19787598, "routerloss_mlp": 0.0, "step": 606, "time_per_iteration": 2.6886532306671143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155761, "balance_loss_mlp": 1.13543582, "diversity_loss_mlp": 0.0, "epoch": 0.11677568295498268, "flos": 716340049920.0, "grad_norm": 0.09113538166915294, "language_loss": 0.90117687, "learning_rate": 0.0009803877531375635, "loss": 0.91273439, "num_input_tokens_seen": 50221248, "router_z_loss_mlp": 0.20336914, "routerloss_mlp": 0.0, "step": 607, "time_per_iteration": 2.877068281173706 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127783, "balance_loss_mlp": 1.1072073, "diversity_loss_mlp": 0.0, "epoch": 0.11696806464024626, "flos": 609758668800.0, "grad_norm": 0.0886917383310614, "language_loss": 0.90959686, "learning_rate": 0.0009803012607066523, "loss": 0.92087471, "num_input_tokens_seen": 50293792, "router_z_loss_mlp": 0.20581055, "routerloss_mlp": 0.0, "step": 608, "time_per_iteration": 2.7187952995300293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110833, "balance_loss_mlp": 1.08786178, "diversity_loss_mlp": 0.0, "epoch": 0.11716044632550981, "flos": 520384103424.0, "grad_norm": 0.061304878637031934, "language_loss": 0.89645171, "learning_rate": 0.0009802145818062543, "loss": 0.90753502, "num_input_tokens_seen": 50367760, "router_z_loss_mlp": 0.20471191, "routerloss_mlp": 0.0, "step": 609, "time_per_iteration": 2.692622423171997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00920288, "balance_loss_mlp": 1.57755673, "diversity_loss_mlp": 0.22646153, "epoch": 0.11735282801077337, "flos": 507493859328.0, "grad_norm": 0.03934500472587961, "language_loss": 0.91726142, "learning_rate": 0.0009801277164700212, "loss": 0.92646432, "num_input_tokens_seen": 50435664, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01827916, "step": 610, "time_per_iteration": 2.5983645915985107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100014, "balance_loss_mlp": 1.07810283, "diversity_loss_mlp": 0.0, "epoch": 0.11754520969603693, "flos": 686638342656.0, "grad_norm": 0.11493980483313035, "language_loss": 0.90203917, "learning_rate": 0.0009800406647316776, "loss": 0.91303933, "num_input_tokens_seen": 50514144, "router_z_loss_mlp": 0.21911621, "routerloss_mlp": 0.0, "step": 611, "time_per_iteration": 2.83890438079834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02350268, "balance_loss_mlp": 2.30563617, "diversity_loss_mlp": 0.0, "epoch": 0.1177375913813005, "flos": 1542487421952.0, "grad_norm": 0.20114955038596882, "language_loss": 0.76914459, "learning_rate": 0.0009799534266250196, "loss": 0.7926473, "num_input_tokens_seen": 50738448, "router_z_loss_mlp": 0.44726562, "routerloss_mlp": 0.0, "step": 612, "time_per_iteration": 4.795763254165649 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111114, "balance_loss_mlp": 1.09067178, "diversity_loss_mlp": 0.0, "epoch": 0.11792997306656407, "flos": 520522495488.0, "grad_norm": 0.10624240262278996, "language_loss": 0.88978302, "learning_rate": 0.000979866002183916, "loss": 0.9008944, "num_input_tokens_seen": 50809328, "router_z_loss_mlp": 0.20471191, "routerloss_mlp": 0.0, "step": 613, "time_per_iteration": 2.660820484161377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121358, "balance_loss_mlp": 1.10140252, "diversity_loss_mlp": 0.0, "epoch": 0.11812235475182763, "flos": 666281189376.0, "grad_norm": 0.11793468153173196, "language_loss": 0.90023279, "learning_rate": 0.0009797783914423082, "loss": 0.91144633, "num_input_tokens_seen": 50887728, "router_z_loss_mlp": 0.19946289, "routerloss_mlp": 0.0, "step": 614, "time_per_iteration": 2.8052501678466797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154456, "balance_loss_mlp": 1.13508475, "diversity_loss_mlp": 0.0, "epoch": 0.11831473643709119, "flos": 621317122560.0, "grad_norm": 0.09232041353489327, "language_loss": 0.84365702, "learning_rate": 0.0009796905944342094, "loss": 0.8552016, "num_input_tokens_seen": 50966160, "router_z_loss_mlp": 0.19360352, "routerloss_mlp": 0.0, "step": 615, "time_per_iteration": 2.829193115234375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164283, "balance_loss_mlp": 1.14475632, "diversity_loss_mlp": 0.0, "epoch": 0.11850711812235475, "flos": 456688710144.0, "grad_norm": 0.08204462941928636, "language_loss": 0.88193601, "learning_rate": 0.0009796026111937057, "loss": 0.89357883, "num_input_tokens_seen": 51035712, "router_z_loss_mlp": 0.19519043, "routerloss_mlp": 0.0, "step": 616, "time_per_iteration": 2.5868873596191406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165697, "balance_loss_mlp": 1.14656377, "diversity_loss_mlp": 0.0, "epoch": 0.11869949980761832, "flos": 513863543808.0, "grad_norm": 0.08667467412120618, "language_loss": 0.88612103, "learning_rate": 0.0009795144417549552, "loss": 0.89777797, "num_input_tokens_seen": 51108656, "router_z_loss_mlp": 0.19128418, "routerloss_mlp": 0.0, "step": 617, "time_per_iteration": 2.689771890640259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163262, "balance_loss_mlp": 1.14452195, "diversity_loss_mlp": 0.0, "epoch": 0.11889188149288188, "flos": 535016171520.0, "grad_norm": 0.07824422885129345, "language_loss": 0.8978498, "learning_rate": 0.0009794260861521883, "loss": 0.90948236, "num_input_tokens_seen": 51185552, "router_z_loss_mlp": 0.18737793, "routerloss_mlp": 0.0, "step": 618, "time_per_iteration": 2.78352689743042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154292, "balance_loss_mlp": 1.13528955, "diversity_loss_mlp": 0.0, "epoch": 0.11908426317814544, "flos": 498603755520.0, "grad_norm": 0.09960243519509318, "language_loss": 0.86907887, "learning_rate": 0.0009793375444197075, "loss": 0.88062179, "num_input_tokens_seen": 51255808, "router_z_loss_mlp": 0.18994141, "routerloss_mlp": 0.0, "step": 619, "time_per_iteration": 2.618597984313965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159739, "balance_loss_mlp": 1.14053416, "diversity_loss_mlp": 0.0, "epoch": 0.119276644863409, "flos": 659891681280.0, "grad_norm": 0.09155899478389973, "language_loss": 0.85016847, "learning_rate": 0.000979248816591888, "loss": 0.86176586, "num_input_tokens_seen": 51329408, "router_z_loss_mlp": 0.1920166, "routerloss_mlp": 0.0, "step": 620, "time_per_iteration": 2.7570278644561768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145713, "balance_loss_mlp": 1.12721133, "diversity_loss_mlp": 0.0, "epoch": 0.11946902654867257, "flos": 758746621440.0, "grad_norm": 0.1108991519321712, "language_loss": 0.86349535, "learning_rate": 0.0009791599027031766, "loss": 0.87495244, "num_input_tokens_seen": 51408784, "router_z_loss_mlp": 0.18493652, "routerloss_mlp": 0.0, "step": 621, "time_per_iteration": 3.2095139026641846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137126, "balance_loss_mlp": 1.11841059, "diversity_loss_mlp": 0.0, "epoch": 0.11966140823393613, "flos": 680999892480.0, "grad_norm": 0.09815511109151757, "language_loss": 0.86187375, "learning_rate": 0.0009790708027880932, "loss": 0.873245, "num_input_tokens_seen": 51482592, "router_z_loss_mlp": 0.18713379, "routerloss_mlp": 0.0, "step": 622, "time_per_iteration": 2.878537654876709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01885107, "balance_loss_mlp": 1.84448004, "diversity_loss_mlp": 0.0, "epoch": 0.11985378991919969, "flos": 1451071853568.0, "grad_norm": 0.060338107853692736, "language_loss": 0.77427292, "learning_rate": 0.0009789815168812293, "loss": 0.79312396, "num_input_tokens_seen": 51712240, "router_z_loss_mlp": 0.40625, "routerloss_mlp": 0.0, "step": 623, "time_per_iteration": 4.854407787322998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147675, "balance_loss_mlp": 1.12785053, "diversity_loss_mlp": 0.0, "epoch": 0.12004617160446325, "flos": 527848441344.0, "grad_norm": 0.08227936779447462, "language_loss": 0.9313252, "learning_rate": 0.0009788920450172487, "loss": 0.94280195, "num_input_tokens_seen": 51781440, "router_z_loss_mlp": 0.19812012, "routerloss_mlp": 0.0, "step": 624, "time_per_iteration": 2.633763551712036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173257, "balance_loss_mlp": 1.15283692, "diversity_loss_mlp": 0.0, "epoch": 0.12023855328972682, "flos": 474219297792.0, "grad_norm": 0.08898942147955141, "language_loss": 0.90448737, "learning_rate": 0.0009788023872308875, "loss": 0.91621995, "num_input_tokens_seen": 51845424, "router_z_loss_mlp": 0.20410156, "routerloss_mlp": 0.0, "step": 625, "time_per_iteration": 2.5277719497680664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01862648, "balance_loss_mlp": 1.82163978, "diversity_loss_mlp": 0.0, "epoch": 0.12043093497499038, "flos": 1531771430400.0, "grad_norm": 0.06145643913195344, "language_loss": 0.75428998, "learning_rate": 0.0009787125435569539, "loss": 0.77291644, "num_input_tokens_seen": 52076496, "router_z_loss_mlp": 0.41015625, "routerloss_mlp": 0.0, "step": 626, "time_per_iteration": 4.746332883834839 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165065, "balance_loss_mlp": 1.1446321, "diversity_loss_mlp": 0.0, "epoch": 0.12062331666025394, "flos": 539839323648.0, "grad_norm": 0.07179626691480034, "language_loss": 0.93775636, "learning_rate": 0.0009786225140303285, "loss": 0.94940698, "num_input_tokens_seen": 52143072, "router_z_loss_mlp": 0.2043457, "routerloss_mlp": 0.0, "step": 627, "time_per_iteration": 2.650980234146118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154743, "balance_loss_mlp": 1.13354802, "diversity_loss_mlp": 0.0, "epoch": 0.1208156983455175, "flos": 511906604544.0, "grad_norm": 0.1000912175423248, "language_loss": 0.91955918, "learning_rate": 0.0009785322986859634, "loss": 0.93110657, "num_input_tokens_seen": 52211888, "router_z_loss_mlp": 0.21191406, "routerloss_mlp": 0.0, "step": 628, "time_per_iteration": 2.699179172515869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0098085, "balance_loss_mlp": 1.69793713, "diversity_loss_mlp": 0.22907162, "epoch": 0.12100808003078108, "flos": 596473072128.0, "grad_norm": 0.03434932946066091, "language_loss": 0.92752671, "learning_rate": 0.0009784418975588838, "loss": 0.93733525, "num_input_tokens_seen": 52283696, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01734566, "step": 629, "time_per_iteration": 2.7467246055603027 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131113, "balance_loss_mlp": 1.10905957, "diversity_loss_mlp": 0.0, "epoch": 0.12120046171604464, "flos": 522970960896.0, "grad_norm": 0.08662072407619689, "language_loss": 0.93157279, "learning_rate": 0.0009783513106841862, "loss": 0.94288397, "num_input_tokens_seen": 52358624, "router_z_loss_mlp": 0.22070312, "routerloss_mlp": 0.0, "step": 630, "time_per_iteration": 2.699862003326416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01893774, "balance_loss_mlp": 1.85181284, "diversity_loss_mlp": 0.0, "epoch": 0.1213928434013082, "flos": 1554463249920.0, "grad_norm": 0.08318726834589595, "language_loss": 0.76732707, "learning_rate": 0.00097826053809704, "loss": 0.78626478, "num_input_tokens_seen": 52591248, "router_z_loss_mlp": 0.41992188, "routerloss_mlp": 0.0, "step": 631, "time_per_iteration": 4.952157258987427 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129662, "balance_loss_mlp": 1.10740614, "diversity_loss_mlp": 0.0, "epoch": 0.12158522508657175, "flos": 495391749120.0, "grad_norm": 0.08011431594745816, "language_loss": 0.87836802, "learning_rate": 0.0009781695798326854, "loss": 0.88966465, "num_input_tokens_seen": 52659920, "router_z_loss_mlp": 0.22265625, "routerloss_mlp": 0.0, "step": 632, "time_per_iteration": 2.5692520141601562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112411, "balance_loss_mlp": 1.10132909, "diversity_loss_mlp": 0.0, "epoch": 0.12177760677183531, "flos": 475585592832.0, "grad_norm": 0.08866631591317527, "language_loss": 0.87804729, "learning_rate": 0.0009780784359264365, "loss": 0.88928837, "num_input_tokens_seen": 52728832, "router_z_loss_mlp": 0.2277832, "routerloss_mlp": 0.0, "step": 633, "time_per_iteration": 2.6267781257629395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00719882, "balance_loss_mlp": 1.16367078, "diversity_loss_mlp": 0.22089316, "epoch": 0.12196998845709889, "flos": 1468458906624.0, "grad_norm": 0.0030158712959469035, "language_loss": 0.74188697, "learning_rate": 0.0009779871064136778, "loss": 0.74908578, "num_input_tokens_seen": 52949776, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.02760048, "step": 634, "time_per_iteration": 4.819004535675049 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00956665, "balance_loss_mlp": 1.64561963, "diversity_loss_mlp": 0.23289478, "epoch": 0.12216237014236245, "flos": 586572378624.0, "grad_norm": 0.029780004210258365, "language_loss": 0.87410563, "learning_rate": 0.000977895591329867, "loss": 0.88367236, "num_input_tokens_seen": 53027184, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.017408, "step": 635, "time_per_iteration": 2.8417630195617676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111829, "balance_loss_mlp": 1.09035909, "diversity_loss_mlp": 0.0, "epoch": 0.12235475182762601, "flos": 597997582848.0, "grad_norm": 0.07301537581986137, "language_loss": 0.86799347, "learning_rate": 0.000977803890710533, "loss": 0.87911177, "num_input_tokens_seen": 53101072, "router_z_loss_mlp": 0.21472168, "routerloss_mlp": 0.0, "step": 636, "time_per_iteration": 2.721245765686035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105977, "balance_loss_mlp": 1.08507979, "diversity_loss_mlp": 0.0, "epoch": 0.12254713351288957, "flos": 497741469696.0, "grad_norm": 0.0646034576227674, "language_loss": 0.93395561, "learning_rate": 0.0009777120045912774, "loss": 0.94501537, "num_input_tokens_seen": 53172992, "router_z_loss_mlp": 0.20898438, "routerloss_mlp": 0.0, "step": 637, "time_per_iteration": 2.5976381301879883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114015, "balance_loss_mlp": 1.09267688, "diversity_loss_mlp": 0.0, "epoch": 0.12273951519815314, "flos": 605847361536.0, "grad_norm": 0.07520229878174765, "language_loss": 0.89586985, "learning_rate": 0.0009776199330077736, "loss": 0.90700996, "num_input_tokens_seen": 53248256, "router_z_loss_mlp": 0.21362305, "routerloss_mlp": 0.0, "step": 638, "time_per_iteration": 2.7055575847625732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127487, "balance_loss_mlp": 1.10741186, "diversity_loss_mlp": 0.0, "epoch": 0.1229318968834167, "flos": 597859190784.0, "grad_norm": 0.08952902399696973, "language_loss": 0.91934389, "learning_rate": 0.0009775276759957667, "loss": 0.93061876, "num_input_tokens_seen": 53318960, "router_z_loss_mlp": 0.20068359, "routerloss_mlp": 0.0, "step": 639, "time_per_iteration": 2.703442096710205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113385, "balance_loss_mlp": 1.11285698, "diversity_loss_mlp": 0.0, "epoch": 0.12312427856868026, "flos": 678383299584.0, "grad_norm": 0.08734236555353025, "language_loss": 0.8993817, "learning_rate": 0.0009774352335910745, "loss": 0.91072023, "num_input_tokens_seen": 53389120, "router_z_loss_mlp": 0.21008301, "routerloss_mlp": 0.0, "step": 640, "time_per_iteration": 2.798133373260498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133691, "balance_loss_mlp": 1.11327052, "diversity_loss_mlp": 0.0, "epoch": 0.12331666025394382, "flos": 608933458944.0, "grad_norm": 0.08010684820371014, "language_loss": 0.94195282, "learning_rate": 0.000977342605829586, "loss": 0.95328975, "num_input_tokens_seen": 53459056, "router_z_loss_mlp": 0.20422363, "routerloss_mlp": 0.0, "step": 641, "time_per_iteration": 2.72929310798645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167832, "balance_loss_mlp": 1.14699411, "diversity_loss_mlp": 0.0, "epoch": 0.12350904193920739, "flos": 762504855552.0, "grad_norm": 0.08202605728626432, "language_loss": 0.85741401, "learning_rate": 0.0009772497927472623, "loss": 0.86909235, "num_input_tokens_seen": 53541552, "router_z_loss_mlp": 0.20837402, "routerloss_mlp": 0.0, "step": 642, "time_per_iteration": 3.071017265319824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166824, "balance_loss_mlp": 1.14637995, "diversity_loss_mlp": 0.0, "epoch": 0.12370142362447095, "flos": 540968481792.0, "grad_norm": 0.0829252807022359, "language_loss": 0.84863311, "learning_rate": 0.0009771567943801368, "loss": 0.86030138, "num_input_tokens_seen": 53611520, "router_z_loss_mlp": 0.20446777, "routerloss_mlp": 0.0, "step": 643, "time_per_iteration": 2.667830228805542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01180894, "balance_loss_mlp": 1.16058123, "diversity_loss_mlp": 0.0, "epoch": 0.12389380530973451, "flos": 548128871424.0, "grad_norm": 0.07304892670416417, "language_loss": 0.89067769, "learning_rate": 0.0009770636107643152, "loss": 0.90248668, "num_input_tokens_seen": 53683888, "router_z_loss_mlp": 0.203125, "routerloss_mlp": 0.0, "step": 644, "time_per_iteration": 2.715703010559082 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187033, "balance_loss_mlp": 1.16633821, "diversity_loss_mlp": 0.0, "epoch": 0.12408618699499807, "flos": 540308828160.0, "grad_norm": 0.07624328698635177, "language_loss": 0.87043303, "learning_rate": 0.0009769702419359738, "loss": 0.88230342, "num_input_tokens_seen": 53751888, "router_z_loss_mlp": 0.20703125, "routerloss_mlp": 0.0, "step": 645, "time_per_iteration": 2.645270586013794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199535, "balance_loss_mlp": 1.17913866, "diversity_loss_mlp": 0.0, "epoch": 0.12427856868026164, "flos": 745792137216.0, "grad_norm": 0.10325279424343262, "language_loss": 0.88927197, "learning_rate": 0.000976876687931362, "loss": 0.90126729, "num_input_tokens_seen": 53827648, "router_z_loss_mlp": 0.20385742, "routerloss_mlp": 0.0, "step": 646, "time_per_iteration": 2.9558987617492676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154414, "balance_loss_mlp": 1.13427997, "diversity_loss_mlp": 0.0, "epoch": 0.1244709503655252, "flos": 533716687872.0, "grad_norm": 0.10259074887379964, "language_loss": 0.84658372, "learning_rate": 0.0009767829487868005, "loss": 0.85812783, "num_input_tokens_seen": 53896400, "router_z_loss_mlp": 0.20129395, "routerloss_mlp": 0.0, "step": 647, "time_per_iteration": 2.593254566192627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165839, "balance_loss_mlp": 1.14557362, "diversity_loss_mlp": 0.0, "epoch": 0.12466333205078876, "flos": 508099184640.0, "grad_norm": 0.08660672395493044, "language_loss": 0.88729513, "learning_rate": 0.000976689024538682, "loss": 0.8989535, "num_input_tokens_seen": 53965904, "router_z_loss_mlp": 0.20263672, "routerloss_mlp": 0.0, "step": 648, "time_per_iteration": 2.6087043285369873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147043, "balance_loss_mlp": 1.12564492, "diversity_loss_mlp": 0.0, "epoch": 0.12485571373605232, "flos": 681345686016.0, "grad_norm": 0.09471610460140056, "language_loss": 0.86980593, "learning_rate": 0.0009765949152234716, "loss": 0.88127637, "num_input_tokens_seen": 54049792, "router_z_loss_mlp": 0.21411133, "routerloss_mlp": 0.0, "step": 649, "time_per_iteration": 2.8878984451293945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02130912, "balance_loss_mlp": 2.08723378, "diversity_loss_mlp": 0.0, "epoch": 0.1250480954213159, "flos": 1330159781376.0, "grad_norm": 0.17488169385486374, "language_loss": 0.78686082, "learning_rate": 0.0009765006208777055, "loss": 0.80816996, "num_input_tokens_seen": 54262432, "router_z_loss_mlp": 0.4375, "routerloss_mlp": 0.0, "step": 650, "time_per_iteration": 4.7227959632873535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125186, "balance_loss_mlp": 1.10393071, "diversity_loss_mlp": 0.0, "epoch": 0.12524047710657946, "flos": 938550758400.0, "grad_norm": 0.09783498118048492, "language_loss": 0.81436628, "learning_rate": 0.0009764061415379919, "loss": 0.82561815, "num_input_tokens_seen": 54351568, "router_z_loss_mlp": 0.21276855, "routerloss_mlp": 0.0, "step": 651, "time_per_iteration": 3.2849485874176025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135606, "balance_loss_mlp": 1.11419618, "diversity_loss_mlp": 0.0, "epoch": 0.12543285879184302, "flos": 513893279232.0, "grad_norm": 0.08568090703098526, "language_loss": 0.88376707, "learning_rate": 0.0009763114772410109, "loss": 0.89512312, "num_input_tokens_seen": 54418944, "router_z_loss_mlp": 0.21435547, "routerloss_mlp": 0.0, "step": 652, "time_per_iteration": 2.640482187271118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147244, "balance_loss_mlp": 1.12633479, "diversity_loss_mlp": 0.0, "epoch": 0.12562524047710658, "flos": 718328922624.0, "grad_norm": 0.0799999486499222, "language_loss": 0.86490756, "learning_rate": 0.0009762166280235146, "loss": 0.87638003, "num_input_tokens_seen": 54495312, "router_z_loss_mlp": 0.20910645, "routerloss_mlp": 0.0, "step": 653, "time_per_iteration": 2.9535903930664062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188128, "balance_loss_mlp": 1.16659844, "diversity_loss_mlp": 0.0, "epoch": 0.12581762216237014, "flos": 563712431616.0, "grad_norm": 0.09522027236447655, "language_loss": 0.86765033, "learning_rate": 0.0009761215939223267, "loss": 0.87953162, "num_input_tokens_seen": 54566832, "router_z_loss_mlp": 0.2154541, "routerloss_mlp": 0.0, "step": 654, "time_per_iteration": 2.7124929428100586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186032, "balance_loss_mlp": 1.16533732, "diversity_loss_mlp": 0.0, "epoch": 0.1260100038476337, "flos": 481893608448.0, "grad_norm": 0.11212167432887624, "language_loss": 0.85993934, "learning_rate": 0.0009760263749743428, "loss": 0.87179965, "num_input_tokens_seen": 54632128, "router_z_loss_mlp": 0.20690918, "routerloss_mlp": 0.0, "step": 655, "time_per_iteration": 2.5919461250305176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171572, "balance_loss_mlp": 1.1518662, "diversity_loss_mlp": 0.0, "epoch": 0.12620238553289725, "flos": 575555010048.0, "grad_norm": 0.09226162692886594, "language_loss": 0.89700639, "learning_rate": 0.0009759309712165299, "loss": 0.9087221, "num_input_tokens_seen": 54707600, "router_z_loss_mlp": 0.19702148, "routerloss_mlp": 0.0, "step": 656, "time_per_iteration": 2.746537685394287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161192, "balance_loss_mlp": 1.14149833, "diversity_loss_mlp": 0.0, "epoch": 0.12639476721816084, "flos": 531164335104.0, "grad_norm": 0.08627335840647962, "language_loss": 0.92326117, "learning_rate": 0.0009758353826859272, "loss": 0.9348731, "num_input_tokens_seen": 54776704, "router_z_loss_mlp": 0.19689941, "routerloss_mlp": 0.0, "step": 657, "time_per_iteration": 2.5861480236053467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128682, "balance_loss_mlp": 1.10790431, "diversity_loss_mlp": 0.0, "epoch": 0.1265871489034244, "flos": 689968917504.0, "grad_norm": 0.1059978443595565, "language_loss": 0.88603538, "learning_rate": 0.0009757396094196456, "loss": 0.89732224, "num_input_tokens_seen": 54851744, "router_z_loss_mlp": 0.20788574, "routerloss_mlp": 0.0, "step": 658, "time_per_iteration": 2.8773136138916016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130444, "balance_loss_mlp": 1.11040533, "diversity_loss_mlp": 0.0, "epoch": 0.12677953058868796, "flos": 537138667008.0, "grad_norm": 0.12293029558515219, "language_loss": 0.83426332, "learning_rate": 0.0009756436514548673, "loss": 0.8455677, "num_input_tokens_seen": 54932576, "router_z_loss_mlp": 0.20031738, "routerloss_mlp": 0.0, "step": 659, "time_per_iteration": 2.810722589492798 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134733, "balance_loss_mlp": 1.11438441, "diversity_loss_mlp": 0.0, "epoch": 0.12697191227395152, "flos": 519022577664.0, "grad_norm": 0.06793027871708798, "language_loss": 0.87658846, "learning_rate": 0.0009755475088288466, "loss": 0.88793576, "num_input_tokens_seen": 55007296, "router_z_loss_mlp": 0.20349121, "routerloss_mlp": 0.0, "step": 660, "time_per_iteration": 2.7121376991271973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147973, "balance_loss_mlp": 1.12785089, "diversity_loss_mlp": 0.0, "epoch": 0.12716429395921508, "flos": 566605808640.0, "grad_norm": 0.08710392398912287, "language_loss": 0.89421189, "learning_rate": 0.0009754511815789095, "loss": 0.90569162, "num_input_tokens_seen": 55079312, "router_z_loss_mlp": 0.20117188, "routerloss_mlp": 0.0, "step": 661, "time_per_iteration": 2.777318239212036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162586, "balance_loss_mlp": 1.14171267, "diversity_loss_mlp": 0.0, "epoch": 0.12735667564447864, "flos": 514103251968.0, "grad_norm": 0.08537034247511402, "language_loss": 0.84716892, "learning_rate": 0.0009753546697424533, "loss": 0.85879481, "num_input_tokens_seen": 55151824, "router_z_loss_mlp": 0.2088623, "routerloss_mlp": 0.0, "step": 662, "time_per_iteration": 2.6664726734161377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169368, "balance_loss_mlp": 1.14935231, "diversity_loss_mlp": 0.0, "epoch": 0.1275490573297422, "flos": 541282341888.0, "grad_norm": 0.08593929583832248, "language_loss": 0.89815515, "learning_rate": 0.0009752579733569475, "loss": 0.90984881, "num_input_tokens_seen": 55224368, "router_z_loss_mlp": 0.20019531, "routerloss_mlp": 0.0, "step": 663, "time_per_iteration": 2.695844888687134 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.02192512, "balance_loss_mlp": 2.16352034, "diversity_loss_mlp": 0.0, "epoch": 0.12774143901500576, "flos": 1558700900352.0, "grad_norm": 0.2093028146020386, "language_loss": 0.74881387, "learning_rate": 0.0009751610924599328, "loss": 0.77073896, "num_input_tokens_seen": 55453584, "router_z_loss_mlp": 0.2890625, "routerloss_mlp": 0.0, "step": 664, "time_per_iteration": 4.96467137336731 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00927072, "balance_loss_mlp": 1.59828615, "diversity_loss_mlp": 0.21952696, "epoch": 0.12793382070026935, "flos": 613744128000.0, "grad_norm": 0.040572636524321984, "language_loss": 0.8949101, "learning_rate": 0.0009750640270890217, "loss": 0.90418077, "num_input_tokens_seen": 55528000, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01816532, "step": 665, "time_per_iteration": 2.7632246017456055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01241186, "balance_loss_mlp": 1.22053885, "diversity_loss_mlp": 0.0, "epoch": 0.1281262023855329, "flos": 707731499520.0, "grad_norm": 0.08846289988129392, "language_loss": 0.95572138, "learning_rate": 0.0009749667772818983, "loss": 0.96813321, "num_input_tokens_seen": 55612416, "router_z_loss_mlp": 0.20654297, "routerloss_mlp": 0.0, "step": 666, "time_per_iteration": 3.037458896636963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0183198, "balance_loss_mlp": 1.80241597, "diversity_loss_mlp": 0.0, "epoch": 0.12831858407079647, "flos": 1425034404864.0, "grad_norm": 0.11554481164154014, "language_loss": 0.76935941, "learning_rate": 0.0009748693430763185, "loss": 0.7876792, "num_input_tokens_seen": 55843664, "router_z_loss_mlp": 0.29492188, "routerloss_mlp": 0.0, "step": 667, "time_per_iteration": 4.810182332992554 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01244511, "balance_loss_mlp": 1.22299325, "diversity_loss_mlp": 0.0, "epoch": 0.12851096575606002, "flos": 449098463232.0, "grad_norm": 0.09137997717488894, "language_loss": 0.94816601, "learning_rate": 0.0009747717245101093, "loss": 0.9606111, "num_input_tokens_seen": 55909072, "router_z_loss_mlp": 0.21520996, "routerloss_mlp": 0.0, "step": 668, "time_per_iteration": 2.552507162094116 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00917856, "balance_loss_mlp": 1.58052325, "diversity_loss_mlp": 0.21830653, "epoch": 0.12870334744132358, "flos": 479939240448.0, "grad_norm": 0.03508480239171642, "language_loss": 0.8457346, "learning_rate": 0.00097467392162117, "loss": 0.85491318, "num_input_tokens_seen": 55978544, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01844162, "step": 669, "time_per_iteration": 2.6064391136169434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01242109, "balance_loss_mlp": 1.21882796, "diversity_loss_mlp": 0.0, "epoch": 0.12889572912658714, "flos": 638936543232.0, "grad_norm": 0.1666980552990896, "language_loss": 0.90609741, "learning_rate": 0.0009745759344474708, "loss": 0.91851848, "num_input_tokens_seen": 56054144, "router_z_loss_mlp": 0.23266602, "routerloss_mlp": 0.0, "step": 670, "time_per_iteration": 2.826202392578125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01229033, "balance_loss_mlp": 1.2077179, "diversity_loss_mlp": 0.0, "epoch": 0.1290881108118507, "flos": 509944896000.0, "grad_norm": 0.09671049007121679, "language_loss": 0.88974905, "learning_rate": 0.0009744777630270536, "loss": 0.90203935, "num_input_tokens_seen": 56120960, "router_z_loss_mlp": 0.21337891, "routerloss_mlp": 0.0, "step": 671, "time_per_iteration": 2.578334331512451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01233527, "balance_loss_mlp": 1.21067417, "diversity_loss_mlp": 0.0, "epoch": 0.12928049249711426, "flos": 671054782464.0, "grad_norm": 0.08999527722625096, "language_loss": 0.92790663, "learning_rate": 0.000974379407398032, "loss": 0.94024187, "num_input_tokens_seen": 56202560, "router_z_loss_mlp": 0.22839355, "routerloss_mlp": 0.0, "step": 672, "time_per_iteration": 2.8661158084869385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01237675, "balance_loss_mlp": 1.21589506, "diversity_loss_mlp": 0.0, "epoch": 0.12947287418237785, "flos": 793525870080.0, "grad_norm": 0.09653126460783178, "language_loss": 0.81875724, "learning_rate": 0.0009742808675985913, "loss": 0.83113402, "num_input_tokens_seen": 56289456, "router_z_loss_mlp": 0.21801758, "routerloss_mlp": 0.0, "step": 673, "time_per_iteration": 3.0861356258392334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01260533, "balance_loss_mlp": 1.23754919, "diversity_loss_mlp": 0.0, "epoch": 0.1296652558676414, "flos": 485466462720.0, "grad_norm": 0.08653130412501808, "language_loss": 0.90219223, "learning_rate": 0.0009741821436669876, "loss": 0.91479754, "num_input_tokens_seen": 56354480, "router_z_loss_mlp": 0.2298584, "routerloss_mlp": 0.0, "step": 674, "time_per_iteration": 2.5609960556030273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01267597, "balance_loss_mlp": 1.24489975, "diversity_loss_mlp": 0.0, "epoch": 0.12985763755290497, "flos": 453459451392.0, "grad_norm": 0.09623752325881015, "language_loss": 0.91791725, "learning_rate": 0.0009740832356415492, "loss": 0.93059325, "num_input_tokens_seen": 56418944, "router_z_loss_mlp": 0.22680664, "routerloss_mlp": 0.0, "step": 675, "time_per_iteration": 2.544027805328369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01295128, "balance_loss_mlp": 1.27278781, "diversity_loss_mlp": 0.0, "epoch": 0.13005001923816853, "flos": 825061178880.0, "grad_norm": 0.08903369590662558, "language_loss": 0.87403589, "learning_rate": 0.0009739841435606756, "loss": 0.88698715, "num_input_tokens_seen": 56492368, "router_z_loss_mlp": 0.22338867, "routerloss_mlp": 0.0, "step": 676, "time_per_iteration": 2.9931325912475586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01261461, "balance_loss_mlp": 1.23933589, "diversity_loss_mlp": 0.0, "epoch": 0.1302424009234321, "flos": 531381648384.0, "grad_norm": 0.0602287995404217, "language_loss": 0.89557111, "learning_rate": 0.0009738848674628377, "loss": 0.90818572, "num_input_tokens_seen": 56568128, "router_z_loss_mlp": 0.22131348, "routerloss_mlp": 0.0, "step": 677, "time_per_iteration": 2.7290966510772705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01264602, "balance_loss_mlp": 1.24307275, "diversity_loss_mlp": 0.0, "epoch": 0.13043478260869565, "flos": 525884161536.0, "grad_norm": 0.10468610894957399, "language_loss": 0.88751101, "learning_rate": 0.000973785407386578, "loss": 0.90015703, "num_input_tokens_seen": 56646448, "router_z_loss_mlp": 0.2154541, "routerloss_mlp": 0.0, "step": 678, "time_per_iteration": 2.7950329780578613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00969584, "balance_loss_mlp": 1.6979661, "diversity_loss_mlp": 0.20886885, "epoch": 0.1306271642939592, "flos": 626172208128.0, "grad_norm": 0.03344489204860934, "language_loss": 0.86933386, "learning_rate": 0.0009736857633705103, "loss": 0.87902969, "num_input_tokens_seen": 56732080, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01616703, "step": 679, "time_per_iteration": 2.8691866397857666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01193718, "balance_loss_mlp": 1.17283261, "diversity_loss_mlp": 0.0, "epoch": 0.13081954597922277, "flos": 550718300160.0, "grad_norm": 0.08130386374469858, "language_loss": 0.92363989, "learning_rate": 0.0009735859354533196, "loss": 0.93557703, "num_input_tokens_seen": 56804432, "router_z_loss_mlp": 0.2088623, "routerloss_mlp": 0.0, "step": 680, "time_per_iteration": 2.6832337379455566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155917, "balance_loss_mlp": 1.13447094, "diversity_loss_mlp": 0.0, "epoch": 0.13101192766448633, "flos": 536911441920.0, "grad_norm": 0.0924188238597787, "language_loss": 0.91083395, "learning_rate": 0.0009734859236737628, "loss": 0.92239314, "num_input_tokens_seen": 56872512, "router_z_loss_mlp": 0.21459961, "routerloss_mlp": 0.0, "step": 681, "time_per_iteration": 2.6023473739624023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125397, "balance_loss_mlp": 1.10410571, "diversity_loss_mlp": 0.0, "epoch": 0.13120430934974991, "flos": 503508400128.0, "grad_norm": 0.08442474228180671, "language_loss": 0.93186569, "learning_rate": 0.0009733857280706678, "loss": 0.9431197, "num_input_tokens_seen": 56940928, "router_z_loss_mlp": 0.2130127, "routerloss_mlp": 0.0, "step": 682, "time_per_iteration": 2.5775911808013916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00968386, "balance_loss_mlp": 1.69064701, "diversity_loss_mlp": 0.21057674, "epoch": 0.13139669103501347, "flos": 614295124992.0, "grad_norm": 0.03992508312329801, "language_loss": 0.84369749, "learning_rate": 0.000973285348682934, "loss": 0.85338134, "num_input_tokens_seen": 57012736, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01777408, "step": 683, "time_per_iteration": 2.768641233444214 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01618305, "balance_loss_mlp": 1.58530831, "diversity_loss_mlp": 0.0, "epoch": 0.13158907272027703, "flos": 1484971564032.0, "grad_norm": 0.09794042911652269, "language_loss": 0.77898371, "learning_rate": 0.0009731847855495323, "loss": 0.79516685, "num_input_tokens_seen": 57243136, "router_z_loss_mlp": 0.33007812, "routerloss_mlp": 0.0, "step": 684, "time_per_iteration": 4.802167177200317 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094162, "balance_loss_mlp": 1.07383704, "diversity_loss_mlp": 0.0, "epoch": 0.1317814544055406, "flos": 985461852672.0, "grad_norm": 0.12652995306024198, "language_loss": 0.84832728, "learning_rate": 0.0009730840387095046, "loss": 0.8592689, "num_input_tokens_seen": 57336160, "router_z_loss_mlp": 0.20324707, "routerloss_mlp": 0.0, "step": 685, "time_per_iteration": 3.2910287380218506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112414, "balance_loss_mlp": 1.09188628, "diversity_loss_mlp": 0.0, "epoch": 0.13197383609080415, "flos": 611456076288.0, "grad_norm": 0.13012317463795417, "language_loss": 0.90537834, "learning_rate": 0.0009729831082019642, "loss": 0.91650254, "num_input_tokens_seen": 57418976, "router_z_loss_mlp": 0.20532227, "routerloss_mlp": 0.0, "step": 686, "time_per_iteration": 2.7909138202667236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121716, "balance_loss_mlp": 1.101331, "diversity_loss_mlp": 0.0, "epoch": 0.1321662177760677, "flos": 494403181056.0, "grad_norm": 0.08096428549902779, "language_loss": 0.88353586, "learning_rate": 0.0009728819940660958, "loss": 0.89475298, "num_input_tokens_seen": 57490288, "router_z_loss_mlp": 0.20385742, "routerloss_mlp": 0.0, "step": 687, "time_per_iteration": 2.7699429988861084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131653, "balance_loss_mlp": 1.11135173, "diversity_loss_mlp": 0.0, "epoch": 0.13235859946133127, "flos": 495841430016.0, "grad_norm": 0.07933225152322496, "language_loss": 0.85085285, "learning_rate": 0.0009727806963411557, "loss": 0.86216938, "num_input_tokens_seen": 57556064, "router_z_loss_mlp": 0.20300293, "routerloss_mlp": 0.0, "step": 688, "time_per_iteration": 2.581984519958496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144043, "balance_loss_mlp": 1.12350333, "diversity_loss_mlp": 0.0, "epoch": 0.13255098114659483, "flos": 511686720000.0, "grad_norm": 0.09807362554425139, "language_loss": 0.87180853, "learning_rate": 0.000972679215066471, "loss": 0.88324893, "num_input_tokens_seen": 57627248, "router_z_loss_mlp": 0.20544434, "routerloss_mlp": 0.0, "step": 689, "time_per_iteration": 2.6538989543914795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148083, "balance_loss_mlp": 1.12809181, "diversity_loss_mlp": 0.0, "epoch": 0.13274336283185842, "flos": 547370472960.0, "grad_norm": 0.09247782934143206, "language_loss": 0.98983967, "learning_rate": 0.0009725775502814401, "loss": 1.00132048, "num_input_tokens_seen": 57694832, "router_z_loss_mlp": 0.19995117, "routerloss_mlp": 0.0, "step": 690, "time_per_iteration": 2.610485315322876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167941, "balance_loss_mlp": 1.14827132, "diversity_loss_mlp": 0.0, "epoch": 0.13293574451712198, "flos": 640772342784.0, "grad_norm": 0.08082631328369684, "language_loss": 0.84880829, "learning_rate": 0.0009724757020255327, "loss": 0.8604877, "num_input_tokens_seen": 57771776, "router_z_loss_mlp": 0.1965332, "routerloss_mlp": 0.0, "step": 691, "time_per_iteration": 2.8424370288848877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152001, "balance_loss_mlp": 1.13209307, "diversity_loss_mlp": 0.0, "epoch": 0.13312812620238554, "flos": 491480441856.0, "grad_norm": 0.09067820147092803, "language_loss": 0.87807095, "learning_rate": 0.0009723736703382902, "loss": 0.88959098, "num_input_tokens_seen": 57836272, "router_z_loss_mlp": 0.19897461, "routerloss_mlp": 0.0, "step": 692, "time_per_iteration": 2.5578606128692627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149266, "balance_loss_mlp": 1.13037133, "diversity_loss_mlp": 0.0, "epoch": 0.1333205078876491, "flos": 508944218112.0, "grad_norm": 0.07979062216362842, "language_loss": 0.82877922, "learning_rate": 0.0009722714552593244, "loss": 0.84027195, "num_input_tokens_seen": 57907232, "router_z_loss_mlp": 0.1887207, "routerloss_mlp": 0.0, "step": 693, "time_per_iteration": 2.6148533821105957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153464, "balance_loss_mlp": 1.13444984, "diversity_loss_mlp": 0.0, "epoch": 0.13351288957291266, "flos": 418697455104.0, "grad_norm": 0.08708336283232748, "language_loss": 0.94164526, "learning_rate": 0.000972169056828319, "loss": 0.9531799, "num_input_tokens_seen": 57969808, "router_z_loss_mlp": 0.18994141, "routerloss_mlp": 0.0, "step": 694, "time_per_iteration": 2.517944097518921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154915, "balance_loss_mlp": 1.1360321, "diversity_loss_mlp": 0.0, "epoch": 0.13370527125817622, "flos": 615901128192.0, "grad_norm": 0.0753733884935208, "language_loss": 0.86921358, "learning_rate": 0.0009720664750850283, "loss": 0.8807627, "num_input_tokens_seen": 58042944, "router_z_loss_mlp": 0.1887207, "routerloss_mlp": 0.0, "step": 695, "time_per_iteration": 2.8149421215057373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148667, "balance_loss_mlp": 1.1299628, "diversity_loss_mlp": 0.0, "epoch": 0.13389765294343978, "flos": 626038958592.0, "grad_norm": 0.09445278911045346, "language_loss": 0.92951906, "learning_rate": 0.0009719637100692784, "loss": 0.94100577, "num_input_tokens_seen": 58116080, "router_z_loss_mlp": 0.18713379, "routerloss_mlp": 0.0, "step": 696, "time_per_iteration": 2.719451904296875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149322, "balance_loss_mlp": 1.13098741, "diversity_loss_mlp": 0.0, "epoch": 0.13409003462870334, "flos": 609691857408.0, "grad_norm": 0.10008701466446891, "language_loss": 0.82604736, "learning_rate": 0.0009718607618209661, "loss": 0.83754057, "num_input_tokens_seen": 58197616, "router_z_loss_mlp": 0.18334961, "routerloss_mlp": 0.0, "step": 697, "time_per_iteration": 2.8692104816436768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148038, "balance_loss_mlp": 1.12914348, "diversity_loss_mlp": 0.0, "epoch": 0.13428241631396692, "flos": 683816546304.0, "grad_norm": 0.07908911060166324, "language_loss": 0.87701273, "learning_rate": 0.0009717576303800595, "loss": 0.88849318, "num_input_tokens_seen": 58280480, "router_z_loss_mlp": 0.1887207, "routerloss_mlp": 0.0, "step": 698, "time_per_iteration": 3.0484437942504883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139219, "balance_loss_mlp": 1.11988366, "diversity_loss_mlp": 0.0, "epoch": 0.13447479799923048, "flos": 508815737856.0, "grad_norm": 0.12480577454910273, "language_loss": 0.85819161, "learning_rate": 0.0009716543157865975, "loss": 0.86958385, "num_input_tokens_seen": 58352464, "router_z_loss_mlp": 0.1932373, "routerloss_mlp": 0.0, "step": 699, "time_per_iteration": 2.706787347793579 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144768, "balance_loss_mlp": 1.12586117, "diversity_loss_mlp": 0.0, "epoch": 0.13466717968449404, "flos": 897510481920.0, "grad_norm": 0.16362357873421526, "language_loss": 0.83352965, "learning_rate": 0.0009715508180806907, "loss": 0.84497738, "num_input_tokens_seen": 58437216, "router_z_loss_mlp": 0.18896484, "routerloss_mlp": 0.0, "step": 700, "time_per_iteration": 3.1985795497894287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162079, "balance_loss_mlp": 1.14230227, "diversity_loss_mlp": 0.0, "epoch": 0.1348595613697576, "flos": 989938838016.0, "grad_norm": 0.08746408781150025, "language_loss": 0.90170425, "learning_rate": 0.0009714471373025202, "loss": 0.91332507, "num_input_tokens_seen": 58533152, "router_z_loss_mlp": 0.19763184, "routerloss_mlp": 0.0, "step": 701, "time_per_iteration": 3.487022638320923 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156513, "balance_loss_mlp": 1.13656974, "diversity_loss_mlp": 0.0, "epoch": 0.13505194305502116, "flos": 487826095104.0, "grad_norm": 0.10787745491017559, "language_loss": 0.88186693, "learning_rate": 0.0009713432734923386, "loss": 0.89343208, "num_input_tokens_seen": 58601376, "router_z_loss_mlp": 0.19934082, "routerloss_mlp": 0.0, "step": 702, "time_per_iteration": 2.6239736080169678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167828, "balance_loss_mlp": 1.14830136, "diversity_loss_mlp": 0.0, "epoch": 0.13524432474028472, "flos": 613385851392.0, "grad_norm": 0.09670789671988574, "language_loss": 0.86879516, "learning_rate": 0.0009712392266904696, "loss": 0.88047349, "num_input_tokens_seen": 58676608, "router_z_loss_mlp": 0.19506836, "routerloss_mlp": 0.0, "step": 703, "time_per_iteration": 2.7542335987091064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181198, "balance_loss_mlp": 1.16149247, "diversity_loss_mlp": 0.0, "epoch": 0.13543670642554828, "flos": 904794582528.0, "grad_norm": 0.10598212751912446, "language_loss": 0.85246772, "learning_rate": 0.0009711349969373076, "loss": 0.86427975, "num_input_tokens_seen": 58759264, "router_z_loss_mlp": 0.19689941, "routerloss_mlp": 0.0, "step": 704, "time_per_iteration": 3.162461042404175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01175522, "balance_loss_mlp": 1.15518451, "diversity_loss_mlp": 0.0, "epoch": 0.13562908811081184, "flos": 550616984064.0, "grad_norm": 0.0954290464489283, "language_loss": 0.80285007, "learning_rate": 0.0009710305842733178, "loss": 0.81460524, "num_input_tokens_seen": 58834800, "router_z_loss_mlp": 0.20336914, "routerloss_mlp": 0.0, "step": 705, "time_per_iteration": 2.7630715370178223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155052, "balance_loss_mlp": 1.13601446, "diversity_loss_mlp": 0.0, "epoch": 0.1358214697960754, "flos": 508044856320.0, "grad_norm": 0.09437017973872532, "language_loss": 0.89630616, "learning_rate": 0.0009709259887390373, "loss": 0.9078567, "num_input_tokens_seen": 58901712, "router_z_loss_mlp": 0.19030762, "routerloss_mlp": 0.0, "step": 706, "time_per_iteration": 2.6160268783569336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00895019, "balance_loss_mlp": 1.55161047, "diversity_loss_mlp": 0.20666173, "epoch": 0.136013851481339, "flos": 528896107008.0, "grad_norm": 0.04273378361131697, "language_loss": 0.90874577, "learning_rate": 0.0009708212103750737, "loss": 0.91769588, "num_input_tokens_seen": 58967824, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01588319, "step": 707, "time_per_iteration": 2.594606399536133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01180444, "balance_loss_mlp": 1.16110778, "diversity_loss_mlp": 0.0, "epoch": 0.13620623316660255, "flos": 659081152512.0, "grad_norm": 0.08814378894040824, "language_loss": 0.87522972, "learning_rate": 0.0009707162492221051, "loss": 0.88703418, "num_input_tokens_seen": 59045040, "router_z_loss_mlp": 0.19335938, "routerloss_mlp": 0.0, "step": 708, "time_per_iteration": 2.8884427547454834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01197388, "balance_loss_mlp": 1.17801642, "diversity_loss_mlp": 0.0, "epoch": 0.1363986148518661, "flos": 671882563584.0, "grad_norm": 0.07892254834086627, "language_loss": 0.87611169, "learning_rate": 0.0009706111053208815, "loss": 0.8880856, "num_input_tokens_seen": 59117216, "router_z_loss_mlp": 0.19348145, "routerloss_mlp": 0.0, "step": 709, "time_per_iteration": 2.7824413776397705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213311, "balance_loss_mlp": 1.19383228, "diversity_loss_mlp": 0.0, "epoch": 0.13659099653712967, "flos": 473062975488.0, "grad_norm": 0.10389736734512126, "language_loss": 0.85504246, "learning_rate": 0.0009705057787122232, "loss": 0.86717558, "num_input_tokens_seen": 59183056, "router_z_loss_mlp": 0.19458008, "routerloss_mlp": 0.0, "step": 710, "time_per_iteration": 2.529498815536499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178108, "balance_loss_mlp": 1.15870059, "diversity_loss_mlp": 0.0, "epoch": 0.13678337822239323, "flos": 452715734016.0, "grad_norm": 0.07975606670492637, "language_loss": 0.91293353, "learning_rate": 0.0009704002694370216, "loss": 0.92471457, "num_input_tokens_seen": 59247312, "router_z_loss_mlp": 0.19384766, "routerloss_mlp": 0.0, "step": 711, "time_per_iteration": 2.5365610122680664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152018, "balance_loss_mlp": 1.13282573, "diversity_loss_mlp": 0.0, "epoch": 0.13697575990765679, "flos": 519623133696.0, "grad_norm": 0.08453852441771745, "language_loss": 0.86583841, "learning_rate": 0.0009702945775362388, "loss": 0.87735862, "num_input_tokens_seen": 59317968, "router_z_loss_mlp": 0.19177246, "routerloss_mlp": 0.0, "step": 712, "time_per_iteration": 2.595674514770508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111883, "balance_loss_mlp": 1.10022175, "diversity_loss_mlp": 0.0, "epoch": 0.13716814159292035, "flos": 480388921344.0, "grad_norm": 0.08096963371537849, "language_loss": 0.87088716, "learning_rate": 0.0009701887030509086, "loss": 0.88207549, "num_input_tokens_seen": 59387936, "router_z_loss_mlp": 0.18615723, "routerloss_mlp": 0.0, "step": 713, "time_per_iteration": 2.6124320030212402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112657, "balance_loss_mlp": 1.09444165, "diversity_loss_mlp": 0.0, "epoch": 0.1373605232781839, "flos": 545650670592.0, "grad_norm": 0.12434454369652892, "language_loss": 0.91262931, "learning_rate": 0.0009700826460221346, "loss": 0.92375588, "num_input_tokens_seen": 59460624, "router_z_loss_mlp": 0.18225098, "routerloss_mlp": 0.0, "step": 714, "time_per_iteration": 2.674612283706665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115832, "balance_loss_mlp": 1.09812903, "diversity_loss_mlp": 0.0, "epoch": 0.1375529049634475, "flos": 708791648256.0, "grad_norm": 0.11407804289300516, "language_loss": 0.92571628, "learning_rate": 0.0009699764064910921, "loss": 0.93687463, "num_input_tokens_seen": 59536752, "router_z_loss_mlp": 0.17712402, "routerloss_mlp": 0.0, "step": 715, "time_per_iteration": 2.8810853958129883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121116, "balance_loss_mlp": 1.10322237, "diversity_loss_mlp": 0.0, "epoch": 0.13774528664871105, "flos": 486696936960.0, "grad_norm": 0.08940816195623212, "language_loss": 0.86826718, "learning_rate": 0.0009698699844990268, "loss": 0.87947834, "num_input_tokens_seen": 59608128, "router_z_loss_mlp": 0.17907715, "routerloss_mlp": 0.0, "step": 716, "time_per_iteration": 2.697970151901245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153213, "balance_loss_mlp": 1.13561809, "diversity_loss_mlp": 0.0, "epoch": 0.1379376683339746, "flos": 680199275520.0, "grad_norm": 0.07906779204708066, "language_loss": 0.88138282, "learning_rate": 0.0009697633800872555, "loss": 0.89291501, "num_input_tokens_seen": 59685120, "router_z_loss_mlp": 0.17614746, "routerloss_mlp": 0.0, "step": 717, "time_per_iteration": 2.8897392749786377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01197417, "balance_loss_mlp": 1.1801312, "diversity_loss_mlp": 0.0, "epoch": 0.13813005001923817, "flos": 610946924544.0, "grad_norm": 0.10867682790127652, "language_loss": 0.9066782, "learning_rate": 0.0009696565932971655, "loss": 0.91865242, "num_input_tokens_seen": 59763376, "router_z_loss_mlp": 0.1730957, "routerloss_mlp": 0.0, "step": 718, "time_per_iteration": 2.8944718837738037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01209582, "balance_loss_mlp": 1.19165277, "diversity_loss_mlp": 0.0, "epoch": 0.13832243170450173, "flos": 588729378816.0, "grad_norm": 0.0949883595308799, "language_loss": 0.89814746, "learning_rate": 0.0009695496241702153, "loss": 0.91024327, "num_input_tokens_seen": 59836800, "router_z_loss_mlp": 0.17944336, "routerloss_mlp": 0.0, "step": 719, "time_per_iteration": 2.7888894081115723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188478, "balance_loss_mlp": 1.17082274, "diversity_loss_mlp": 0.0, "epoch": 0.1385148133897653, "flos": 700002860544.0, "grad_norm": 0.11627833553714081, "language_loss": 0.86245799, "learning_rate": 0.0009694424727479339, "loss": 0.87434286, "num_input_tokens_seen": 59914720, "router_z_loss_mlp": 0.17687988, "routerloss_mlp": 0.0, "step": 720, "time_per_iteration": 2.901224374771118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157865, "balance_loss_mlp": 1.14056826, "diversity_loss_mlp": 0.0, "epoch": 0.13870719507502885, "flos": 598254543360.0, "grad_norm": 0.09369792564045784, "language_loss": 0.88928097, "learning_rate": 0.0009693351390719213, "loss": 0.90085959, "num_input_tokens_seen": 59984544, "router_z_loss_mlp": 0.1730957, "routerloss_mlp": 0.0, "step": 721, "time_per_iteration": 2.6945152282714844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126431, "balance_loss_mlp": 1.10868096, "diversity_loss_mlp": 0.0, "epoch": 0.1388995767602924, "flos": 586572378624.0, "grad_norm": 0.07998653864580182, "language_loss": 0.90800881, "learning_rate": 0.000969227623183848, "loss": 0.91927308, "num_input_tokens_seen": 60057056, "router_z_loss_mlp": 0.1776123, "routerloss_mlp": 0.0, "step": 722, "time_per_iteration": 2.789515733718872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110503, "balance_loss_mlp": 1.0873754, "diversity_loss_mlp": 0.0, "epoch": 0.139091958445556, "flos": 651120145920.0, "grad_norm": 0.07914116119322331, "language_loss": 0.90912664, "learning_rate": 0.0009691199251254554, "loss": 0.92017698, "num_input_tokens_seen": 60133232, "router_z_loss_mlp": 0.17663574, "routerloss_mlp": 0.0, "step": 723, "time_per_iteration": 2.8231685161590576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0093359, "balance_loss_mlp": 1.62175167, "diversity_loss_mlp": 0.20987722, "epoch": 0.13928434013081956, "flos": 575737818624.0, "grad_norm": 0.03669424434563534, "language_loss": 0.86868215, "learning_rate": 0.0009690120449385555, "loss": 0.87801802, "num_input_tokens_seen": 60207104, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01777578, "step": 724, "time_per_iteration": 2.8498518466949463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093753, "balance_loss_mlp": 1.07543111, "diversity_loss_mlp": 0.0, "epoch": 0.13947672181608312, "flos": 563225674752.0, "grad_norm": 0.10366482624390064, "language_loss": 0.92449063, "learning_rate": 0.0009689039826650312, "loss": 0.93542814, "num_input_tokens_seen": 60277920, "router_z_loss_mlp": 0.18322754, "routerloss_mlp": 0.0, "step": 725, "time_per_iteration": 2.7611966133117676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0154366, "balance_loss_mlp": 1.50932813, "diversity_loss_mlp": 0.0, "epoch": 0.13966910350134668, "flos": 1521546964992.0, "grad_norm": 0.08078369374569346, "language_loss": 0.76523066, "learning_rate": 0.000968795738346836, "loss": 0.78066719, "num_input_tokens_seen": 60494224, "router_z_loss_mlp": 0.34375, "routerloss_mlp": 0.0, "step": 726, "time_per_iteration": 4.927435398101807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00933775, "balance_loss_mlp": 1.62253523, "diversity_loss_mlp": 0.20735951, "epoch": 0.13986148518661023, "flos": 499854053376.0, "grad_norm": 0.04309218151041253, "language_loss": 0.87429261, "learning_rate": 0.0009686873120259941, "loss": 0.88363039, "num_input_tokens_seen": 60562176, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01882811, "step": 727, "time_per_iteration": 2.602264165878296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113926, "balance_loss_mlp": 1.12035322, "diversity_loss_mlp": 0.0, "epoch": 0.1400538668718738, "flos": 598674488832.0, "grad_norm": 0.14876828859354083, "language_loss": 0.8713131, "learning_rate": 0.0009685787037446004, "loss": 0.88270569, "num_input_tokens_seen": 60631472, "router_z_loss_mlp": 0.18884277, "routerloss_mlp": 0.0, "step": 728, "time_per_iteration": 2.806549072265625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118218, "balance_loss_mlp": 1.09903765, "diversity_loss_mlp": 0.0, "epoch": 0.14024624855713735, "flos": 594039287808.0, "grad_norm": 0.1987640778264907, "language_loss": 0.87505388, "learning_rate": 0.0009684699135448201, "loss": 0.88623607, "num_input_tokens_seen": 60703488, "router_z_loss_mlp": 0.19165039, "routerloss_mlp": 0.0, "step": 729, "time_per_iteration": 2.7200138568878174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112792, "balance_loss_mlp": 1.09435034, "diversity_loss_mlp": 0.0, "epoch": 0.1404386302424009, "flos": 506584585728.0, "grad_norm": 0.0640895655048784, "language_loss": 0.92135447, "learning_rate": 0.0009683609414688895, "loss": 0.93248242, "num_input_tokens_seen": 60773936, "router_z_loss_mlp": 0.18432617, "routerloss_mlp": 0.0, "step": 730, "time_per_iteration": 2.7423696517944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00911127, "balance_loss_mlp": 1.58117688, "diversity_loss_mlp": 0.20959289, "epoch": 0.14063101192766447, "flos": 573407921664.0, "grad_norm": 0.03249579551243702, "language_loss": 0.86587501, "learning_rate": 0.0009682517875591154, "loss": 0.87498629, "num_input_tokens_seen": 60851120, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01574249, "step": 731, "time_per_iteration": 2.809400796890259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199938, "balance_loss_mlp": 1.18138909, "diversity_loss_mlp": 0.0, "epoch": 0.14082339361292806, "flos": 564619133952.0, "grad_norm": 0.07609394509363156, "language_loss": 0.86229968, "learning_rate": 0.0009681424518578749, "loss": 0.87429905, "num_input_tokens_seen": 60924896, "router_z_loss_mlp": 0.18530273, "routerloss_mlp": 0.0, "step": 732, "time_per_iteration": 2.725839614868164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01283686, "balance_loss_mlp": 1.26505399, "diversity_loss_mlp": 0.0, "epoch": 0.14101577529819162, "flos": 463584798720.0, "grad_norm": 0.1414658743658329, "language_loss": 0.87506676, "learning_rate": 0.000968032934407616, "loss": 0.88790363, "num_input_tokens_seen": 60996016, "router_z_loss_mlp": 0.1862793, "routerloss_mlp": 0.0, "step": 733, "time_per_iteration": 2.583768844604492 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01310281, "balance_loss_mlp": 1.29136264, "diversity_loss_mlp": 0.0, "epoch": 0.14120815698345518, "flos": 596085060096.0, "grad_norm": 0.10963887531318486, "language_loss": 0.81871867, "learning_rate": 0.0009679232352508571, "loss": 0.8318215, "num_input_tokens_seen": 61072016, "router_z_loss_mlp": 0.18908691, "routerloss_mlp": 0.0, "step": 734, "time_per_iteration": 2.785585880279541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01286635, "balance_loss_mlp": 1.26744211, "diversity_loss_mlp": 0.0, "epoch": 0.14140053866871874, "flos": 535137311232.0, "grad_norm": 0.10469043869015734, "language_loss": 0.80695581, "learning_rate": 0.0009678133544301871, "loss": 0.81982213, "num_input_tokens_seen": 61144528, "router_z_loss_mlp": 0.19165039, "routerloss_mlp": 0.0, "step": 735, "time_per_iteration": 2.6638481616973877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01224375, "balance_loss_mlp": 1.20588589, "diversity_loss_mlp": 0.0, "epoch": 0.1415929203539823, "flos": 520265534976.0, "grad_norm": 0.06500438819618859, "language_loss": 0.91870093, "learning_rate": 0.0009677032919882658, "loss": 0.93094468, "num_input_tokens_seen": 61216960, "router_z_loss_mlp": 0.18493652, "routerloss_mlp": 0.0, "step": 736, "time_per_iteration": 2.6578378677368164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01197974, "balance_loss_mlp": 1.18056929, "diversity_loss_mlp": 0.0, "epoch": 0.14178530203924586, "flos": 482335948800.0, "grad_norm": 0.09940630997209131, "language_loss": 0.91374373, "learning_rate": 0.000967593047967823, "loss": 0.92572349, "num_input_tokens_seen": 61281312, "router_z_loss_mlp": 0.17419434, "routerloss_mlp": 0.0, "step": 737, "time_per_iteration": 2.5236403942108154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117212, "balance_loss_mlp": 1.15476346, "diversity_loss_mlp": 0.0, "epoch": 0.14197768372450942, "flos": 676638904320.0, "grad_norm": 0.10840920786543624, "language_loss": 0.86479127, "learning_rate": 0.0009674826224116593, "loss": 0.87651253, "num_input_tokens_seen": 61355888, "router_z_loss_mlp": 0.17370605, "routerloss_mlp": 0.0, "step": 738, "time_per_iteration": 2.803260326385498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134605, "balance_loss_mlp": 1.11759412, "diversity_loss_mlp": 0.0, "epoch": 0.14217006540977298, "flos": 446039529984.0, "grad_norm": 0.09051392518082112, "language_loss": 0.86862409, "learning_rate": 0.0009673720153626455, "loss": 0.87997013, "num_input_tokens_seen": 61424288, "router_z_loss_mlp": 0.17028809, "routerloss_mlp": 0.0, "step": 739, "time_per_iteration": 2.6086573600769043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124987, "balance_loss_mlp": 1.10798764, "diversity_loss_mlp": 0.0, "epoch": 0.14236244709503657, "flos": 496503654912.0, "grad_norm": 0.11444093339414264, "language_loss": 0.8689152, "learning_rate": 0.0009672612268637235, "loss": 0.88016504, "num_input_tokens_seen": 61493344, "router_z_loss_mlp": 0.17016602, "routerloss_mlp": 0.0, "step": 740, "time_per_iteration": 2.582648277282715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116151, "balance_loss_mlp": 1.09880614, "diversity_loss_mlp": 0.0, "epoch": 0.14255482878030012, "flos": 648313403904.0, "grad_norm": 0.10874190594389947, "language_loss": 0.84213787, "learning_rate": 0.0009671502569579048, "loss": 0.85329938, "num_input_tokens_seen": 61565216, "router_z_loss_mlp": 0.17370605, "routerloss_mlp": 0.0, "step": 741, "time_per_iteration": 2.7945284843444824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132432, "balance_loss_mlp": 1.11539662, "diversity_loss_mlp": 0.0, "epoch": 0.14274721046556368, "flos": 536165153280.0, "grad_norm": 0.07140691777849974, "language_loss": 0.89503837, "learning_rate": 0.0009670391056882719, "loss": 0.90636265, "num_input_tokens_seen": 61640928, "router_z_loss_mlp": 0.17053223, "routerloss_mlp": 0.0, "step": 742, "time_per_iteration": 2.71687912940979 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149228, "balance_loss_mlp": 1.13240731, "diversity_loss_mlp": 0.0, "epoch": 0.14293959215082724, "flos": 957057431040.0, "grad_norm": 0.08672376963732596, "language_loss": 0.88698781, "learning_rate": 0.0009669277730979776, "loss": 0.89848006, "num_input_tokens_seen": 61717552, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 743, "time_per_iteration": 3.2029030323028564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147207, "balance_loss_mlp": 1.13025546, "diversity_loss_mlp": 0.0, "epoch": 0.1431319738360908, "flos": 693089519616.0, "grad_norm": 0.09113342882689801, "language_loss": 0.85227454, "learning_rate": 0.0009668162592302449, "loss": 0.86374664, "num_input_tokens_seen": 61800016, "router_z_loss_mlp": 0.16955566, "routerloss_mlp": 0.0, "step": 744, "time_per_iteration": 2.899656057357788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165032, "balance_loss_mlp": 1.14748406, "diversity_loss_mlp": 0.0, "epoch": 0.14332435552135436, "flos": 565439574528.0, "grad_norm": 0.07780467137911447, "language_loss": 0.86560214, "learning_rate": 0.0009667045641283676, "loss": 0.87725246, "num_input_tokens_seen": 61865904, "router_z_loss_mlp": 0.17553711, "routerloss_mlp": 0.0, "step": 745, "time_per_iteration": 2.6474997997283936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159177, "balance_loss_mlp": 1.14148676, "diversity_loss_mlp": 0.0, "epoch": 0.14351673720661792, "flos": 738374787072.0, "grad_norm": 0.09864944110558675, "language_loss": 0.95312673, "learning_rate": 0.0009665926878357092, "loss": 0.96471858, "num_input_tokens_seen": 61945728, "router_z_loss_mlp": 0.17700195, "routerloss_mlp": 0.0, "step": 746, "time_per_iteration": 2.946307420730591 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00851982, "balance_loss_mlp": 1.46230698, "diversity_loss_mlp": 0.20995456, "epoch": 0.14370911889188148, "flos": 549230865408.0, "grad_norm": 0.034792990408202794, "language_loss": 0.91192698, "learning_rate": 0.0009664806303957043, "loss": 0.92044681, "num_input_tokens_seen": 62016288, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01585159, "step": 747, "time_per_iteration": 2.706286668777466 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160661, "balance_loss_mlp": 1.14221931, "diversity_loss_mlp": 0.0, "epoch": 0.14390150057714507, "flos": 590295734784.0, "grad_norm": 0.08367194984434445, "language_loss": 0.87066692, "learning_rate": 0.0009663683918518571, "loss": 0.88227355, "num_input_tokens_seen": 62097904, "router_z_loss_mlp": 0.18444824, "routerloss_mlp": 0.0, "step": 748, "time_per_iteration": 2.892982244491577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136526, "balance_loss_mlp": 1.11831081, "diversity_loss_mlp": 0.0, "epoch": 0.14409388226240863, "flos": 591047165952.0, "grad_norm": 0.07455761265115375, "language_loss": 0.85490787, "learning_rate": 0.0009662559722477428, "loss": 0.86627316, "num_input_tokens_seen": 62166736, "router_z_loss_mlp": 0.18237305, "routerloss_mlp": 0.0, "step": 749, "time_per_iteration": 2.6979615688323975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01292346, "balance_loss_mlp": 1.2582047, "diversity_loss_mlp": 0.0, "epoch": 0.1442862639476722, "flos": 1511263401984.0, "grad_norm": 0.08640394257539531, "language_loss": 0.7616297, "learning_rate": 0.0009661433716270062, "loss": 0.77455318, "num_input_tokens_seen": 62402512, "router_z_loss_mlp": 0.34179688, "routerloss_mlp": 0.0, "step": 750, "time_per_iteration": 4.991304397583008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128515, "balance_loss_mlp": 1.11068118, "diversity_loss_mlp": 0.0, "epoch": 0.14447864563293575, "flos": 496765384704.0, "grad_norm": 0.07866539193327844, "language_loss": 0.89197791, "learning_rate": 0.0009660305900333632, "loss": 0.90326303, "num_input_tokens_seen": 62473408, "router_z_loss_mlp": 0.17834473, "routerloss_mlp": 0.0, "step": 751, "time_per_iteration": 2.6706793308258057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121937, "balance_loss_mlp": 1.1038413, "diversity_loss_mlp": 0.0, "epoch": 0.1446710273181993, "flos": 589678299648.0, "grad_norm": 0.10038132697844201, "language_loss": 0.82478833, "learning_rate": 0.0009659176275105992, "loss": 0.83600777, "num_input_tokens_seen": 62547440, "router_z_loss_mlp": 0.1809082, "routerloss_mlp": 0.0, "step": 752, "time_per_iteration": 2.697909355163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126212, "balance_loss_mlp": 1.10777032, "diversity_loss_mlp": 0.0, "epoch": 0.14486340900346287, "flos": 585818749440.0, "grad_norm": 0.10638604925915984, "language_loss": 0.85756153, "learning_rate": 0.0009658044841025701, "loss": 0.86882365, "num_input_tokens_seen": 62620224, "router_z_loss_mlp": 0.18444824, "routerloss_mlp": 0.0, "step": 753, "time_per_iteration": 2.7749171257019043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128331, "balance_loss_mlp": 1.1107595, "diversity_loss_mlp": 0.0, "epoch": 0.14505579068872643, "flos": 504672062976.0, "grad_norm": 0.09130861127340602, "language_loss": 0.81584072, "learning_rate": 0.0009656911598532021, "loss": 0.827124, "num_input_tokens_seen": 62690464, "router_z_loss_mlp": 0.17590332, "routerloss_mlp": 0.0, "step": 754, "time_per_iteration": 2.635702610015869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136592, "balance_loss_mlp": 1.11914003, "diversity_loss_mlp": 0.0, "epoch": 0.14524817237399, "flos": 486815505408.0, "grad_norm": 0.06835454276473461, "language_loss": 0.90494555, "learning_rate": 0.0009655776548064917, "loss": 0.9163115, "num_input_tokens_seen": 62762240, "router_z_loss_mlp": 0.17456055, "routerloss_mlp": 0.0, "step": 755, "time_per_iteration": 2.6545748710632324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135969, "balance_loss_mlp": 1.11902952, "diversity_loss_mlp": 0.0, "epoch": 0.14544055405925355, "flos": 728175287808.0, "grad_norm": 0.07886906074703284, "language_loss": 0.88367254, "learning_rate": 0.0009654639690065054, "loss": 0.89503217, "num_input_tokens_seen": 62839760, "router_z_loss_mlp": 0.16943359, "routerloss_mlp": 0.0, "step": 756, "time_per_iteration": 2.8773815631866455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150961, "balance_loss_mlp": 1.13343716, "diversity_loss_mlp": 0.0, "epoch": 0.14563293574451713, "flos": 593643935232.0, "grad_norm": 0.07604063018618923, "language_loss": 0.8823185, "learning_rate": 0.00096535010249738, "loss": 0.89382815, "num_input_tokens_seen": 62910336, "router_z_loss_mlp": 0.17529297, "routerloss_mlp": 0.0, "step": 757, "time_per_iteration": 2.7175021171569824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00846707, "balance_loss_mlp": 1.45519352, "diversity_loss_mlp": 0.20419648, "epoch": 0.1458253174297807, "flos": 560478030336.0, "grad_norm": 0.03954501513556402, "language_loss": 0.82782531, "learning_rate": 0.0009652360553233224, "loss": 0.83629239, "num_input_tokens_seen": 62988160, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.017012, "step": 758, "time_per_iteration": 2.7434637546539307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115333, "balance_loss_mlp": 1.12624609, "diversity_loss_mlp": 0.0, "epoch": 0.14601769911504425, "flos": 1557855866880.0, "grad_norm": 0.03342191973393777, "language_loss": 0.73773748, "learning_rate": 0.0009651218275286093, "loss": 0.7492708, "num_input_tokens_seen": 63224704, "router_z_loss_mlp": 0.27148438, "routerloss_mlp": 0.0, "step": 759, "time_per_iteration": 4.910880088806152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188786, "balance_loss_mlp": 1.17063034, "diversity_loss_mlp": 0.0, "epoch": 0.1462100808003078, "flos": 866301516288.0, "grad_norm": 0.0638252555407819, "language_loss": 0.81659228, "learning_rate": 0.0009650074191575883, "loss": 0.82848012, "num_input_tokens_seen": 63312400, "router_z_loss_mlp": 0.18151855, "routerloss_mlp": 0.0, "step": 760, "time_per_iteration": 3.2028603553771973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01213565, "balance_loss_mlp": 1.19484925, "diversity_loss_mlp": 0.0, "epoch": 0.14640246248557137, "flos": 522943796736.0, "grad_norm": 0.07046318146001718, "language_loss": 0.86031073, "learning_rate": 0.0009648928302546766, "loss": 0.87244636, "num_input_tokens_seen": 63387792, "router_z_loss_mlp": 0.18713379, "routerloss_mlp": 0.0, "step": 761, "time_per_iteration": 2.6812515258789062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01243947, "balance_loss_mlp": 1.22551703, "diversity_loss_mlp": 0.0, "epoch": 0.14659484417083493, "flos": 1030544487936.0, "grad_norm": 0.0884537515073792, "language_loss": 0.85470825, "learning_rate": 0.0009647780608643613, "loss": 0.86714768, "num_input_tokens_seen": 63475632, "router_z_loss_mlp": 0.1842041, "routerloss_mlp": 0.0, "step": 762, "time_per_iteration": 3.3486785888671875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.012302, "balance_loss_mlp": 1.21243811, "diversity_loss_mlp": 0.0, "epoch": 0.1467872258560985, "flos": 500671922688.0, "grad_norm": 0.12042495658723557, "language_loss": 0.874053, "learning_rate": 0.0009646631110312001, "loss": 0.88635492, "num_input_tokens_seen": 63546080, "router_z_loss_mlp": 0.17773438, "routerloss_mlp": 0.0, "step": 763, "time_per_iteration": 2.6648313999176025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172377, "balance_loss_mlp": 1.1544956, "diversity_loss_mlp": 0.0, "epoch": 0.14697960754136205, "flos": 547797758976.0, "grad_norm": 0.05916332097574664, "language_loss": 0.8841719, "learning_rate": 0.0009645479807998203, "loss": 0.89589572, "num_input_tokens_seen": 63622464, "router_z_loss_mlp": 0.17883301, "routerloss_mlp": 0.0, "step": 764, "time_per_iteration": 2.7347912788391113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147505, "balance_loss_mlp": 1.12983775, "diversity_loss_mlp": 0.0, "epoch": 0.14717198922662564, "flos": 517849003008.0, "grad_norm": 0.06985321722585584, "language_loss": 0.92467874, "learning_rate": 0.0009644326702149196, "loss": 0.93615377, "num_input_tokens_seen": 63694736, "router_z_loss_mlp": 0.17675781, "routerloss_mlp": 0.0, "step": 765, "time_per_iteration": 2.7316319942474365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135962, "balance_loss_mlp": 1.11803293, "diversity_loss_mlp": 0.0, "epoch": 0.1473643709118892, "flos": 732024552960.0, "grad_norm": 0.09157028460957184, "language_loss": 0.84919345, "learning_rate": 0.0009643171793212653, "loss": 0.86055309, "num_input_tokens_seen": 63779072, "router_z_loss_mlp": 0.17944336, "routerloss_mlp": 0.0, "step": 766, "time_per_iteration": 3.116917610168457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105537, "balance_loss_mlp": 1.08738184, "diversity_loss_mlp": 0.0, "epoch": 0.14755675259715276, "flos": 620538900480.0, "grad_norm": 0.08034801396880724, "language_loss": 0.89233959, "learning_rate": 0.0009642015081636952, "loss": 0.90339494, "num_input_tokens_seen": 63847472, "router_z_loss_mlp": 0.18164062, "routerloss_mlp": 0.0, "step": 767, "time_per_iteration": 2.705993175506592 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103513, "balance_loss_mlp": 1.08563185, "diversity_loss_mlp": 0.0, "epoch": 0.14774913428241632, "flos": 452219065344.0, "grad_norm": 0.09221888586765616, "language_loss": 0.88360566, "learning_rate": 0.0009640856567871166, "loss": 0.8946408, "num_input_tokens_seen": 63912496, "router_z_loss_mlp": 0.17895508, "routerloss_mlp": 0.0, "step": 768, "time_per_iteration": 2.5172243118286133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108598, "balance_loss_mlp": 1.08981061, "diversity_loss_mlp": 0.0, "epoch": 0.14794151596767988, "flos": 837234869760.0, "grad_norm": 0.0844592716079577, "language_loss": 0.89047211, "learning_rate": 0.0009639696252365072, "loss": 0.9015581, "num_input_tokens_seen": 63990832, "router_z_loss_mlp": 0.18786621, "routerloss_mlp": 0.0, "step": 769, "time_per_iteration": 3.034848690032959 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105095, "balance_loss_mlp": 1.08673656, "diversity_loss_mlp": 0.0, "epoch": 0.14813389765294344, "flos": 686092114944.0, "grad_norm": 0.07095543604969227, "language_loss": 0.81996548, "learning_rate": 0.0009638534135569144, "loss": 0.83101642, "num_input_tokens_seen": 64067552, "router_z_loss_mlp": 0.18371582, "routerloss_mlp": 0.0, "step": 770, "time_per_iteration": 2.947564125061035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106726, "balance_loss_mlp": 1.08859468, "diversity_loss_mlp": 0.0, "epoch": 0.148326279338207, "flos": 509887996416.0, "grad_norm": 0.08627707323979403, "language_loss": 0.9012745, "learning_rate": 0.0009637370217934554, "loss": 0.91234171, "num_input_tokens_seen": 64140336, "router_z_loss_mlp": 0.18139648, "routerloss_mlp": 0.0, "step": 771, "time_per_iteration": 2.6592423915863037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111886, "balance_loss_mlp": 1.09355128, "diversity_loss_mlp": 0.0, "epoch": 0.14851866102347056, "flos": 588161129472.0, "grad_norm": 0.06345294765682771, "language_loss": 0.82981932, "learning_rate": 0.0009636204499913175, "loss": 0.84093815, "num_input_tokens_seen": 64223472, "router_z_loss_mlp": 0.18334961, "routerloss_mlp": 0.0, "step": 772, "time_per_iteration": 2.8836610317230225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115362, "balance_loss_mlp": 1.09749293, "diversity_loss_mlp": 0.0, "epoch": 0.14871104270873411, "flos": 691026494976.0, "grad_norm": 0.06338786563117527, "language_loss": 0.87914705, "learning_rate": 0.0009635036981957581, "loss": 0.89030063, "num_input_tokens_seen": 64299872, "router_z_loss_mlp": 0.17883301, "routerloss_mlp": 0.0, "step": 773, "time_per_iteration": 2.885239601135254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132405, "balance_loss_mlp": 1.11417794, "diversity_loss_mlp": 0.0, "epoch": 0.1489034243939977, "flos": 655098264576.0, "grad_norm": 0.08623405645423676, "language_loss": 0.90735364, "learning_rate": 0.0009633867664521043, "loss": 0.91867769, "num_input_tokens_seen": 64377152, "router_z_loss_mlp": 0.18212891, "routerloss_mlp": 0.0, "step": 774, "time_per_iteration": 2.802264451980591 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159356, "balance_loss_mlp": 1.14176083, "diversity_loss_mlp": 0.0, "epoch": 0.14909580607926126, "flos": 475835212800.0, "grad_norm": 0.09977443827883303, "language_loss": 0.86760318, "learning_rate": 0.0009632696548057527, "loss": 0.8791967, "num_input_tokens_seen": 64443008, "router_z_loss_mlp": 0.17614746, "routerloss_mlp": 0.0, "step": 775, "time_per_iteration": 2.5641794204711914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187156, "balance_loss_mlp": 1.16960835, "diversity_loss_mlp": 0.0, "epoch": 0.14928818776452482, "flos": 611087887872.0, "grad_norm": 0.08744626586779954, "language_loss": 0.85013115, "learning_rate": 0.0009631523633021704, "loss": 0.86200273, "num_input_tokens_seen": 64519776, "router_z_loss_mlp": 0.17565918, "routerloss_mlp": 0.0, "step": 776, "time_per_iteration": 2.7851786613464355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00881631, "balance_loss_mlp": 1.52411294, "diversity_loss_mlp": 0.20632464, "epoch": 0.14948056944978838, "flos": 561772744704.0, "grad_norm": 0.038364140445948956, "language_loss": 0.88378215, "learning_rate": 0.0009630348919868936, "loss": 0.89259851, "num_input_tokens_seen": 64593712, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0164127, "step": 777, "time_per_iteration": 2.7285845279693604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01191902, "balance_loss_mlp": 1.17415154, "diversity_loss_mlp": 0.0, "epoch": 0.14967295113505194, "flos": 449199779328.0, "grad_norm": 0.14061909589017782, "language_loss": 0.81450796, "learning_rate": 0.0009629172409055293, "loss": 0.82642698, "num_input_tokens_seen": 64658448, "router_z_loss_mlp": 0.1776123, "routerloss_mlp": 0.0, "step": 778, "time_per_iteration": 2.5018203258514404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154162, "balance_loss_mlp": 1.13728166, "diversity_loss_mlp": 0.0, "epoch": 0.1498653328203155, "flos": 571285426176.0, "grad_norm": 0.06968828956123203, "language_loss": 0.87518388, "learning_rate": 0.0009627994101037531, "loss": 0.88672549, "num_input_tokens_seen": 64734144, "router_z_loss_mlp": 0.16894531, "routerloss_mlp": 0.0, "step": 779, "time_per_iteration": 2.763136863708496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139257, "balance_loss_mlp": 1.12231779, "diversity_loss_mlp": 0.0, "epoch": 0.15005771450557906, "flos": 631215244800.0, "grad_norm": 0.07833298109740298, "language_loss": 0.88761836, "learning_rate": 0.0009626813996273114, "loss": 0.8990109, "num_input_tokens_seen": 64813456, "router_z_loss_mlp": 0.16943359, "routerloss_mlp": 0.0, "step": 780, "time_per_iteration": 2.8791675567626953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117275, "balance_loss_mlp": 1.09990597, "diversity_loss_mlp": 0.0, "epoch": 0.15025009619084262, "flos": 577939235328.0, "grad_norm": 0.09603506751758703, "language_loss": 0.89051467, "learning_rate": 0.0009625632095220198, "loss": 0.90168738, "num_input_tokens_seen": 64896816, "router_z_loss_mlp": 0.17370605, "routerloss_mlp": 0.0, "step": 781, "time_per_iteration": 2.8194801807403564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119786, "balance_loss_mlp": 1.10251248, "diversity_loss_mlp": 0.0, "epoch": 0.1504424778761062, "flos": 483887623680.0, "grad_norm": 0.1003760880169841, "language_loss": 0.86904705, "learning_rate": 0.0009624448398337637, "loss": 0.88024497, "num_input_tokens_seen": 64964176, "router_z_loss_mlp": 0.17297363, "routerloss_mlp": 0.0, "step": 782, "time_per_iteration": 2.511925458908081 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117445, "balance_loss_mlp": 1.10021877, "diversity_loss_mlp": 0.0, "epoch": 0.15063485956136977, "flos": 762512196096.0, "grad_norm": 0.08409428795596587, "language_loss": 0.8913728, "learning_rate": 0.0009623262906084984, "loss": 0.90254724, "num_input_tokens_seen": 65042592, "router_z_loss_mlp": 0.17236328, "routerloss_mlp": 0.0, "step": 783, "time_per_iteration": 2.9890754222869873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125496, "balance_loss_mlp": 1.10804367, "diversity_loss_mlp": 0.0, "epoch": 0.15082724124663333, "flos": 497630241792.0, "grad_norm": 0.07818041002140835, "language_loss": 0.90351313, "learning_rate": 0.0009622075618922486, "loss": 0.9147681, "num_input_tokens_seen": 65114576, "router_z_loss_mlp": 0.17480469, "routerloss_mlp": 0.0, "step": 784, "time_per_iteration": 2.6550891399383545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119712, "balance_loss_mlp": 1.10261774, "diversity_loss_mlp": 0.0, "epoch": 0.15101962293189689, "flos": 509725011456.0, "grad_norm": 0.07239943737193227, "language_loss": 0.87125635, "learning_rate": 0.0009620886537311091, "loss": 0.88245344, "num_input_tokens_seen": 65186640, "router_z_loss_mlp": 0.17114258, "routerloss_mlp": 0.0, "step": 785, "time_per_iteration": 2.646864652633667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125571, "balance_loss_mlp": 1.10794032, "diversity_loss_mlp": 0.0, "epoch": 0.15121200461716044, "flos": 457756199424.0, "grad_norm": 0.08980079735835493, "language_loss": 0.85309643, "learning_rate": 0.000961969566171244, "loss": 0.86435217, "num_input_tokens_seen": 65252112, "router_z_loss_mlp": 0.1763916, "routerloss_mlp": 0.0, "step": 786, "time_per_iteration": 2.5803041458129883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136873, "balance_loss_mlp": 1.11938524, "diversity_loss_mlp": 0.0, "epoch": 0.151404386302424, "flos": 537986271744.0, "grad_norm": 0.08282756535064502, "language_loss": 0.8993417, "learning_rate": 0.0009618502992588873, "loss": 0.91071045, "num_input_tokens_seen": 65318912, "router_z_loss_mlp": 0.17504883, "routerloss_mlp": 0.0, "step": 787, "time_per_iteration": 2.6479151248931885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124837, "balance_loss_mlp": 1.10727715, "diversity_loss_mlp": 0.0, "epoch": 0.15159676798768756, "flos": 688209467904.0, "grad_norm": 0.07571751270322945, "language_loss": 0.8792628, "learning_rate": 0.0009617308530403424, "loss": 0.89051116, "num_input_tokens_seen": 65395424, "router_z_loss_mlp": 0.17565918, "routerloss_mlp": 0.0, "step": 788, "time_per_iteration": 3.002804756164551 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125535, "balance_loss_mlp": 1.10758173, "diversity_loss_mlp": 0.0, "epoch": 0.15178914967295112, "flos": 545319558144.0, "grad_norm": 0.0842913885359751, "language_loss": 0.88032806, "learning_rate": 0.0009616112275619825, "loss": 0.89158338, "num_input_tokens_seen": 65470480, "router_z_loss_mlp": 0.1796875, "routerloss_mlp": 0.0, "step": 789, "time_per_iteration": 2.6842775344848633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110837, "balance_loss_mlp": 1.09398067, "diversity_loss_mlp": 0.0, "epoch": 0.1519815313582147, "flos": 511770783744.0, "grad_norm": 0.07451962795351484, "language_loss": 0.83893597, "learning_rate": 0.0009614914228702503, "loss": 0.85004437, "num_input_tokens_seen": 65544720, "router_z_loss_mlp": 0.1685791, "routerloss_mlp": 0.0, "step": 790, "time_per_iteration": 2.714026689529419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095726, "balance_loss_mlp": 1.07848811, "diversity_loss_mlp": 0.0, "epoch": 0.15217391304347827, "flos": 684088187904.0, "grad_norm": 0.07099161447381937, "language_loss": 0.89133644, "learning_rate": 0.0009613714390116581, "loss": 0.90229368, "num_input_tokens_seen": 65627872, "router_z_loss_mlp": 0.17260742, "routerloss_mlp": 0.0, "step": 791, "time_per_iteration": 2.947917938232422 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089669, "balance_loss_mlp": 1.0730865, "diversity_loss_mlp": 0.0, "epoch": 0.15236629472874183, "flos": 644186981376.0, "grad_norm": 0.07518738092336623, "language_loss": 0.86102855, "learning_rate": 0.0009612512760327879, "loss": 0.87192523, "num_input_tokens_seen": 65705264, "router_z_loss_mlp": 0.16589355, "routerloss_mlp": 0.0, "step": 792, "time_per_iteration": 2.887404203414917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092437, "balance_loss_mlp": 1.07553315, "diversity_loss_mlp": 0.0, "epoch": 0.1525586764140054, "flos": 412876196352.0, "grad_norm": 0.09992337759040973, "language_loss": 0.85428631, "learning_rate": 0.0009611309339802909, "loss": 0.86521071, "num_input_tokens_seen": 65768592, "router_z_loss_mlp": 0.16918945, "routerloss_mlp": 0.0, "step": 793, "time_per_iteration": 2.463308811187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101517, "balance_loss_mlp": 1.08537626, "diversity_loss_mlp": 0.0, "epoch": 0.15275105809926895, "flos": 802801414656.0, "grad_norm": 0.07717151134226699, "language_loss": 0.84535038, "learning_rate": 0.0009610104129008881, "loss": 0.85636556, "num_input_tokens_seen": 65852432, "router_z_loss_mlp": 0.16137695, "routerloss_mlp": 0.0, "step": 794, "time_per_iteration": 3.1276698112487793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108014, "balance_loss_mlp": 1.09176612, "diversity_loss_mlp": 0.0, "epoch": 0.1529434397845325, "flos": 612422249472.0, "grad_norm": 0.07067272187318202, "language_loss": 0.88475168, "learning_rate": 0.0009608897128413701, "loss": 0.89583182, "num_input_tokens_seen": 65927904, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 795, "time_per_iteration": 2.7658157348632812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110863, "balance_loss_mlp": 1.09251332, "diversity_loss_mlp": 0.0, "epoch": 0.15313582146979607, "flos": 615246243840.0, "grad_norm": 0.05987412473430484, "language_loss": 0.85522842, "learning_rate": 0.0009607688338485965, "loss": 0.86631477, "num_input_tokens_seen": 66006800, "router_z_loss_mlp": 0.16113281, "routerloss_mlp": 0.0, "step": 796, "time_per_iteration": 2.849942207336426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112598, "balance_loss_mlp": 1.10935068, "diversity_loss_mlp": 0.0, "epoch": 0.15332820315505963, "flos": 793602593280.0, "grad_norm": 0.07148533051381147, "language_loss": 0.90245026, "learning_rate": 0.0009606477759694969, "loss": 0.91371006, "num_input_tokens_seen": 66088608, "router_z_loss_mlp": 0.16638184, "routerloss_mlp": 0.0, "step": 797, "time_per_iteration": 3.0240113735198975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144466, "balance_loss_mlp": 1.12839675, "diversity_loss_mlp": 0.0, "epoch": 0.1535205848403232, "flos": 550206950400.0, "grad_norm": 0.07535837127697287, "language_loss": 0.87540114, "learning_rate": 0.0009605265392510703, "loss": 0.88684577, "num_input_tokens_seen": 66153616, "router_z_loss_mlp": 0.16064453, "routerloss_mlp": 0.0, "step": 798, "time_per_iteration": 2.6324868202209473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147656, "balance_loss_mlp": 1.13140786, "diversity_loss_mlp": 0.0, "epoch": 0.15371296652558677, "flos": 535947840000.0, "grad_norm": 0.070317951825601, "language_loss": 0.91919398, "learning_rate": 0.0009604051237403846, "loss": 0.93067056, "num_input_tokens_seen": 66219472, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 799, "time_per_iteration": 2.6472957134246826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159957, "balance_loss_mlp": 1.14441192, "diversity_loss_mlp": 0.0, "epoch": 0.15390534821085033, "flos": 395219699712.0, "grad_norm": 0.08825283549053219, "language_loss": 0.8626982, "learning_rate": 0.0009602835294845776, "loss": 0.8742978, "num_input_tokens_seen": 66281456, "router_z_loss_mlp": 0.15527344, "routerloss_mlp": 0.0, "step": 800, "time_per_iteration": 2.4501516819000244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141823, "balance_loss_mlp": 1.12552738, "diversity_loss_mlp": 0.0, "epoch": 0.1540977298961139, "flos": 535846523904.0, "grad_norm": 0.07489761537063061, "language_loss": 0.89964634, "learning_rate": 0.0009601617565308565, "loss": 0.91106457, "num_input_tokens_seen": 66348160, "router_z_loss_mlp": 0.16296387, "routerloss_mlp": 0.0, "step": 801, "time_per_iteration": 2.6480391025543213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00945745, "balance_loss_mlp": 1.65525413, "diversity_loss_mlp": 0.20237769, "epoch": 0.15429011158137745, "flos": 723727664640.0, "grad_norm": 0.03656221347615257, "language_loss": 0.8655234, "learning_rate": 0.0009600398049264977, "loss": 0.87498081, "num_input_tokens_seen": 66430576, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01692954, "step": 802, "time_per_iteration": 3.0029048919677734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00923116, "balance_loss_mlp": 1.61011553, "diversity_loss_mlp": 0.20312682, "epoch": 0.154482493266641, "flos": 620516505600.0, "grad_norm": 0.045238735441598905, "language_loss": 0.92041564, "learning_rate": 0.0009599176747188469, "loss": 0.92964679, "num_input_tokens_seen": 66506480, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0164945, "step": 803, "time_per_iteration": 2.860461473464966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113914, "balance_loss_mlp": 1.12246239, "diversity_loss_mlp": 0.0, "epoch": 0.15467487495190457, "flos": 525624629760.0, "grad_norm": 0.08350523706559901, "language_loss": 0.83155477, "learning_rate": 0.0009597953659553196, "loss": 0.84294617, "num_input_tokens_seen": 66577680, "router_z_loss_mlp": 0.16687012, "routerloss_mlp": 0.0, "step": 804, "time_per_iteration": 2.733302116394043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139673, "balance_loss_mlp": 1.12363935, "diversity_loss_mlp": 0.0, "epoch": 0.15486725663716813, "flos": 527729872896.0, "grad_norm": 0.08094420015679657, "language_loss": 0.89484847, "learning_rate": 0.0009596728786833997, "loss": 0.90624517, "num_input_tokens_seen": 66648496, "router_z_loss_mlp": 0.16027832, "routerloss_mlp": 0.0, "step": 805, "time_per_iteration": 2.602963447570801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112483, "balance_loss_mlp": 1.10851073, "diversity_loss_mlp": 0.0, "epoch": 0.1550596383224317, "flos": 1048549349376.0, "grad_norm": 0.09295267358895155, "language_loss": 0.8926357, "learning_rate": 0.0009595502129506415, "loss": 0.90388405, "num_input_tokens_seen": 66735216, "router_z_loss_mlp": 0.16320801, "routerloss_mlp": 0.0, "step": 806, "time_per_iteration": 3.358494997024536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112525, "balance_loss_mlp": 1.10893035, "diversity_loss_mlp": 0.0, "epoch": 0.15525202000769528, "flos": 613716963840.0, "grad_norm": 0.09807919542340894, "language_loss": 0.82600027, "learning_rate": 0.0009594273688046678, "loss": 0.83725274, "num_input_tokens_seen": 66810672, "router_z_loss_mlp": 0.16320801, "routerloss_mlp": 0.0, "step": 807, "time_per_iteration": 2.7516088485717773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121041, "balance_loss_mlp": 1.10408974, "diversity_loss_mlp": 0.0, "epoch": 0.15544440169295884, "flos": 533064374784.0, "grad_norm": 0.13657059547118527, "language_loss": 0.85685933, "learning_rate": 0.000959304346293171, "loss": 0.86806977, "num_input_tokens_seen": 66879824, "router_z_loss_mlp": 0.16955566, "routerloss_mlp": 0.0, "step": 808, "time_per_iteration": 2.676118850708008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133717, "balance_loss_mlp": 1.11686087, "diversity_loss_mlp": 0.0, "epoch": 0.1556367833782224, "flos": 644723297280.0, "grad_norm": 0.08670416080232539, "language_loss": 0.88104093, "learning_rate": 0.0009591811454639125, "loss": 0.89237815, "num_input_tokens_seen": 66949424, "router_z_loss_mlp": 0.16870117, "routerloss_mlp": 0.0, "step": 809, "time_per_iteration": 2.806877613067627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143795, "balance_loss_mlp": 1.12712979, "diversity_loss_mlp": 0.0, "epoch": 0.15582916506348596, "flos": 543822211584.0, "grad_norm": 0.07575766208840308, "language_loss": 0.88623202, "learning_rate": 0.0009590577663647234, "loss": 0.89766991, "num_input_tokens_seen": 67024000, "router_z_loss_mlp": 0.16662598, "routerloss_mlp": 0.0, "step": 810, "time_per_iteration": 2.705397605895996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167139, "balance_loss_mlp": 1.15012765, "diversity_loss_mlp": 0.0, "epoch": 0.15602154674874952, "flos": 580034566656.0, "grad_norm": 0.07966338850805216, "language_loss": 0.86178398, "learning_rate": 0.0009589342090435036, "loss": 0.87345541, "num_input_tokens_seen": 67100672, "router_z_loss_mlp": 0.17028809, "routerloss_mlp": 0.0, "step": 811, "time_per_iteration": 2.767648935317993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164589, "balance_loss_mlp": 1.14749408, "diversity_loss_mlp": 0.0, "epoch": 0.15621392843401308, "flos": 535248539136.0, "grad_norm": 0.07988119295983553, "language_loss": 0.87430739, "learning_rate": 0.0009588104735482223, "loss": 0.88595331, "num_input_tokens_seen": 67171584, "router_z_loss_mlp": 0.17102051, "routerloss_mlp": 0.0, "step": 812, "time_per_iteration": 2.6543996334075928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167363, "balance_loss_mlp": 1.14989901, "diversity_loss_mlp": 0.0, "epoch": 0.15640631011927664, "flos": 550903680000.0, "grad_norm": 0.09429144108453459, "language_loss": 0.83906114, "learning_rate": 0.0009586865599269177, "loss": 0.85073483, "num_input_tokens_seen": 67240640, "router_z_loss_mlp": 0.17480469, "routerloss_mlp": 0.0, "step": 813, "time_per_iteration": 2.632206439971924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01180179, "balance_loss_mlp": 1.1632992, "diversity_loss_mlp": 0.0, "epoch": 0.1565986918045402, "flos": 637478843904.0, "grad_norm": 0.08748302318090055, "language_loss": 0.88416874, "learning_rate": 0.0009585624682276977, "loss": 0.89597052, "num_input_tokens_seen": 67312976, "router_z_loss_mlp": 0.16894531, "routerloss_mlp": 0.0, "step": 814, "time_per_iteration": 2.7365036010742188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187488, "balance_loss_mlp": 1.17066741, "diversity_loss_mlp": 0.0, "epoch": 0.15679107348980378, "flos": 490810876416.0, "grad_norm": 0.08109713122840453, "language_loss": 0.87263978, "learning_rate": 0.0009584381984987386, "loss": 0.88451469, "num_input_tokens_seen": 67378528, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 815, "time_per_iteration": 2.5354831218719482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011941, "balance_loss_mlp": 1.1770407, "diversity_loss_mlp": 0.0, "epoch": 0.15698345517506734, "flos": 529951113216.0, "grad_norm": 0.07928759805262754, "language_loss": 0.89978456, "learning_rate": 0.0009583137507882864, "loss": 0.91172552, "num_input_tokens_seen": 67449728, "router_z_loss_mlp": 0.17077637, "routerloss_mlp": 0.0, "step": 816, "time_per_iteration": 2.679156541824341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00895961, "balance_loss_mlp": 1.55854249, "diversity_loss_mlp": 0.20119007, "epoch": 0.1571758368603309, "flos": 546038682624.0, "grad_norm": 0.035733799703693336, "language_loss": 0.81236839, "learning_rate": 0.000958189125144656, "loss": 0.82132804, "num_input_tokens_seen": 67520512, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0160944, "step": 817, "time_per_iteration": 2.6629080772399902 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01211679, "balance_loss_mlp": 1.1954186, "diversity_loss_mlp": 0.0, "epoch": 0.15736821854559446, "flos": 565649547264.0, "grad_norm": 0.08655764528844483, "language_loss": 0.88309336, "learning_rate": 0.0009580643216162313, "loss": 0.89521015, "num_input_tokens_seen": 67592464, "router_z_loss_mlp": 0.16259766, "routerloss_mlp": 0.0, "step": 818, "time_per_iteration": 2.6631743907928467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174608, "balance_loss_mlp": 1.15813375, "diversity_loss_mlp": 0.0, "epoch": 0.15756060023085802, "flos": 500956047360.0, "grad_norm": 0.07543766685957613, "language_loss": 0.79610753, "learning_rate": 0.0009579393402514652, "loss": 0.80785358, "num_input_tokens_seen": 67658928, "router_z_loss_mlp": 0.16479492, "routerloss_mlp": 0.0, "step": 819, "time_per_iteration": 2.5706892013549805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116637, "balance_loss_mlp": 1.15002656, "diversity_loss_mlp": 0.0, "epoch": 0.15775298191612158, "flos": 519264857088.0, "grad_norm": 0.08555828674018097, "language_loss": 0.90543056, "learning_rate": 0.0009578141810988801, "loss": 0.91709423, "num_input_tokens_seen": 67727936, "router_z_loss_mlp": 0.16345215, "routerloss_mlp": 0.0, "step": 820, "time_per_iteration": 2.6443581581115723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154879, "balance_loss_mlp": 1.13852358, "diversity_loss_mlp": 0.0, "epoch": 0.15794536360138514, "flos": 466129810944.0, "grad_norm": 0.08457683432578478, "language_loss": 0.90617025, "learning_rate": 0.0009576888442070668, "loss": 0.91771901, "num_input_tokens_seen": 67795488, "router_z_loss_mlp": 0.16357422, "routerloss_mlp": 0.0, "step": 821, "time_per_iteration": 2.588172197341919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131243, "balance_loss_mlp": 1.11597228, "diversity_loss_mlp": 0.0, "epoch": 0.1581377452866487, "flos": 517162185216.0, "grad_norm": 0.08246293521158644, "language_loss": 0.92183721, "learning_rate": 0.0009575633296246854, "loss": 0.93314958, "num_input_tokens_seen": 67858896, "router_z_loss_mlp": 0.15246582, "routerloss_mlp": 0.0, "step": 822, "time_per_iteration": 2.5674116611480713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00894902, "balance_loss_mlp": 1.55344844, "diversity_loss_mlp": 0.20225295, "epoch": 0.15833012697191226, "flos": 549784433664.0, "grad_norm": 0.035537794180972825, "language_loss": 0.83368647, "learning_rate": 0.0009574376374004652, "loss": 0.84263551, "num_input_tokens_seen": 67924864, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01705186, "step": 823, "time_per_iteration": 2.6215808391571045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124038, "balance_loss_mlp": 1.10815978, "diversity_loss_mlp": 0.0, "epoch": 0.15852250865717585, "flos": 487457906688.0, "grad_norm": 0.07732147283422666, "language_loss": 0.801727, "learning_rate": 0.000957311767583204, "loss": 0.81296742, "num_input_tokens_seen": 67992912, "router_z_loss_mlp": 0.15869141, "routerloss_mlp": 0.0, "step": 824, "time_per_iteration": 2.6025402545928955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114811, "balance_loss_mlp": 1.12617576, "diversity_loss_mlp": 0.0, "epoch": 0.1587148903424394, "flos": 1309770694656.0, "grad_norm": 0.06675818035974217, "language_loss": 0.8207159, "learning_rate": 0.0009571857202217691, "loss": 0.83219701, "num_input_tokens_seen": 68207408, "router_z_loss_mlp": 0.21972656, "routerloss_mlp": 0.0, "step": 825, "time_per_iteration": 4.730658531188965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00883043, "balance_loss_mlp": 1.5295732, "diversity_loss_mlp": 0.20110103, "epoch": 0.15890727202770297, "flos": 466873528320.0, "grad_norm": 0.0472865977200058, "language_loss": 0.91635585, "learning_rate": 0.0009570594953650961, "loss": 0.92518628, "num_input_tokens_seen": 68270864, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01770616, "step": 826, "time_per_iteration": 2.528219699859619 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119191, "balance_loss_mlp": 1.10247803, "diversity_loss_mlp": 0.0, "epoch": 0.15909965371296653, "flos": 777107188224.0, "grad_norm": 0.1137923923451387, "language_loss": 0.80430406, "learning_rate": 0.00095693309306219, "loss": 0.81549597, "num_input_tokens_seen": 68355408, "router_z_loss_mlp": 0.16723633, "routerloss_mlp": 0.0, "step": 827, "time_per_iteration": 3.0950989723205566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111184, "balance_loss_mlp": 1.09513879, "diversity_loss_mlp": 0.0, "epoch": 0.1592920353982301, "flos": 1078273451520.0, "grad_norm": 0.08215179220405018, "language_loss": 0.87886679, "learning_rate": 0.0009568065133621244, "loss": 0.8899852, "num_input_tokens_seen": 68437072, "router_z_loss_mlp": 0.16699219, "routerloss_mlp": 0.0, "step": 828, "time_per_iteration": 3.367777109146118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106235, "balance_loss_mlp": 1.08993912, "diversity_loss_mlp": 0.0, "epoch": 0.15948441708349365, "flos": 725622935040.0, "grad_norm": 0.0806870261134831, "language_loss": 0.85100621, "learning_rate": 0.0009566797563140422, "loss": 0.86206853, "num_input_tokens_seen": 68511696, "router_z_loss_mlp": 0.16296387, "routerloss_mlp": 0.0, "step": 829, "time_per_iteration": 2.8803212642669678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122437, "balance_loss_mlp": 1.10618925, "diversity_loss_mlp": 0.0, "epoch": 0.1596767987687572, "flos": 578771785728.0, "grad_norm": 0.0881590388408274, "language_loss": 0.88045579, "learning_rate": 0.0009565528219671547, "loss": 0.89168018, "num_input_tokens_seen": 68587488, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 830, "time_per_iteration": 2.8965914249420166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130205, "balance_loss_mlp": 1.11437368, "diversity_loss_mlp": 0.0, "epoch": 0.15986918045402077, "flos": 528987511296.0, "grad_norm": 0.08433678519740714, "language_loss": 0.84820044, "learning_rate": 0.0009564257103707418, "loss": 0.85950249, "num_input_tokens_seen": 68655760, "router_z_loss_mlp": 0.15820312, "routerloss_mlp": 0.0, "step": 831, "time_per_iteration": 2.6071205139160156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138047, "balance_loss_mlp": 1.12237096, "diversity_loss_mlp": 0.0, "epoch": 0.16006156213928435, "flos": 574584067584.0, "grad_norm": 0.08192391736137887, "language_loss": 0.90990019, "learning_rate": 0.0009562984215741533, "loss": 0.92128068, "num_input_tokens_seen": 68724560, "router_z_loss_mlp": 0.15661621, "routerloss_mlp": 0.0, "step": 832, "time_per_iteration": 2.647022008895874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126204, "balance_loss_mlp": 1.11050415, "diversity_loss_mlp": 0.0, "epoch": 0.1602539438245479, "flos": 515541127680.0, "grad_norm": 0.08304692865674389, "language_loss": 0.8233614, "learning_rate": 0.0009561709556268065, "loss": 0.83462346, "num_input_tokens_seen": 68795440, "router_z_loss_mlp": 0.15686035, "routerloss_mlp": 0.0, "step": 833, "time_per_iteration": 2.7033326625823975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113334, "balance_loss_mlp": 1.09758639, "diversity_loss_mlp": 0.0, "epoch": 0.16044632550981147, "flos": 621015745536.0, "grad_norm": 0.1118379895427605, "language_loss": 0.94022137, "learning_rate": 0.0009560433125781884, "loss": 0.95135468, "num_input_tokens_seen": 68868176, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 834, "time_per_iteration": 2.7286314964294434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137088, "balance_loss_mlp": 1.12088716, "diversity_loss_mlp": 0.0, "epoch": 0.16063870719507503, "flos": 561078586368.0, "grad_norm": 0.07457680689162895, "language_loss": 0.92389894, "learning_rate": 0.0009559154924778544, "loss": 0.93526971, "num_input_tokens_seen": 68939616, "router_z_loss_mlp": 0.1619873, "routerloss_mlp": 0.0, "step": 835, "time_per_iteration": 2.7348785400390625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143876, "balance_loss_mlp": 1.12812805, "diversity_loss_mlp": 0.0, "epoch": 0.1608310888803386, "flos": 805133882880.0, "grad_norm": 0.10043267780752475, "language_loss": 0.85037422, "learning_rate": 0.0009557874953754284, "loss": 0.86181295, "num_input_tokens_seen": 69016192, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 836, "time_per_iteration": 3.069246768951416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156501, "balance_loss_mlp": 1.14049125, "diversity_loss_mlp": 0.0, "epoch": 0.16102347056560215, "flos": 600587011584.0, "grad_norm": 0.08327927090533828, "language_loss": 0.83506572, "learning_rate": 0.0009556593213206038, "loss": 0.84663069, "num_input_tokens_seen": 69089360, "router_z_loss_mlp": 0.16003418, "routerloss_mlp": 0.0, "step": 837, "time_per_iteration": 2.7368414402008057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01190738, "balance_loss_mlp": 1.17505026, "diversity_loss_mlp": 0.0, "epoch": 0.1612158522508657, "flos": 553510361088.0, "grad_norm": 0.08045457133261572, "language_loss": 0.87076676, "learning_rate": 0.0009555309703631414, "loss": 0.88267422, "num_input_tokens_seen": 69161952, "router_z_loss_mlp": 0.15673828, "routerloss_mlp": 0.0, "step": 838, "time_per_iteration": 2.72027850151062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01180132, "balance_loss_mlp": 1.16382456, "diversity_loss_mlp": 0.0, "epoch": 0.16140823393612927, "flos": 555963969024.0, "grad_norm": 0.09367634959673259, "language_loss": 0.87476748, "learning_rate": 0.0009554024425528722, "loss": 0.88656878, "num_input_tokens_seen": 69232432, "router_z_loss_mlp": 0.16308594, "routerloss_mlp": 0.0, "step": 839, "time_per_iteration": 2.7314722537994385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173375, "balance_loss_mlp": 1.15756762, "diversity_loss_mlp": 0.0, "epoch": 0.16160061562139286, "flos": 543871770624.0, "grad_norm": 0.0683151622017414, "language_loss": 0.88983327, "learning_rate": 0.0009552737379396948, "loss": 0.90156698, "num_input_tokens_seen": 69297696, "router_z_loss_mlp": 0.15795898, "routerloss_mlp": 0.0, "step": 840, "time_per_iteration": 2.6384117603302 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165757, "balance_loss_mlp": 1.14950919, "diversity_loss_mlp": 0.0, "epoch": 0.16179299730665642, "flos": 603873169920.0, "grad_norm": 0.08203724053437887, "language_loss": 0.87545735, "learning_rate": 0.0009551448565735767, "loss": 0.88711488, "num_input_tokens_seen": 69373888, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 841, "time_per_iteration": 2.7497382164001465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158402, "balance_loss_mlp": 1.14156926, "diversity_loss_mlp": 0.0, "epoch": 0.16198537899191998, "flos": 787166097408.0, "grad_norm": 0.08523302245909381, "language_loss": 0.84374112, "learning_rate": 0.0009550157985045543, "loss": 0.8553251, "num_input_tokens_seen": 69449984, "router_z_loss_mlp": 0.16845703, "routerloss_mlp": 0.0, "step": 842, "time_per_iteration": 3.080169916152954 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114708, "balance_loss_mlp": 1.13046193, "diversity_loss_mlp": 0.0, "epoch": 0.16217776067718354, "flos": 519805942272.0, "grad_norm": 0.10255895710786052, "language_loss": 0.89356017, "learning_rate": 0.0009548865637827321, "loss": 0.90503097, "num_input_tokens_seen": 69522736, "router_z_loss_mlp": 0.16625977, "routerloss_mlp": 0.0, "step": 843, "time_per_iteration": 2.684195041656494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158581, "balance_loss_mlp": 1.14129627, "diversity_loss_mlp": 0.0, "epoch": 0.1623701423624471, "flos": 505262707200.0, "grad_norm": 0.08376364289368579, "language_loss": 0.89409387, "learning_rate": 0.0009547571524582838, "loss": 0.90567964, "num_input_tokens_seen": 69587184, "router_z_loss_mlp": 0.17297363, "routerloss_mlp": 0.0, "step": 844, "time_per_iteration": 2.5846645832061768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157702, "balance_loss_mlp": 1.14051175, "diversity_loss_mlp": 0.0, "epoch": 0.16256252404771065, "flos": 497183132160.0, "grad_norm": 0.09201378669766774, "language_loss": 0.92096436, "learning_rate": 0.0009546275645814512, "loss": 0.93254137, "num_input_tokens_seen": 69656560, "router_z_loss_mlp": 0.17211914, "routerloss_mlp": 0.0, "step": 845, "time_per_iteration": 2.603830575942993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165367, "balance_loss_mlp": 1.1485343, "diversity_loss_mlp": 0.0, "epoch": 0.16275490573297421, "flos": 502344737280.0, "grad_norm": 0.11870998115484692, "language_loss": 0.8935858, "learning_rate": 0.0009544978002025446, "loss": 0.90523952, "num_input_tokens_seen": 69723872, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 846, "time_per_iteration": 2.57155179977417 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167547, "balance_loss_mlp": 1.15075064, "diversity_loss_mlp": 0.0, "epoch": 0.16294728741823777, "flos": 507231756288.0, "grad_norm": 0.08095587687984966, "language_loss": 0.86639023, "learning_rate": 0.0009543678593719434, "loss": 0.87806571, "num_input_tokens_seen": 69795504, "router_z_loss_mlp": 0.16809082, "routerloss_mlp": 0.0, "step": 847, "time_per_iteration": 2.7022597789764404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01189002, "balance_loss_mlp": 1.17215741, "diversity_loss_mlp": 0.0, "epoch": 0.16313966910350133, "flos": 509685364224.0, "grad_norm": 0.06757237913003537, "language_loss": 0.87374425, "learning_rate": 0.0009542377421400945, "loss": 0.8856343, "num_input_tokens_seen": 69873408, "router_z_loss_mlp": 0.1685791, "routerloss_mlp": 0.0, "step": 848, "time_per_iteration": 2.7858939170837402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01209239, "balance_loss_mlp": 1.1922878, "diversity_loss_mlp": 0.0, "epoch": 0.16333205078876492, "flos": 543980427264.0, "grad_norm": 0.0709695929057924, "language_loss": 0.83489215, "learning_rate": 0.0009541074485575145, "loss": 0.84698457, "num_input_tokens_seen": 69944112, "router_z_loss_mlp": 0.16967773, "routerloss_mlp": 0.0, "step": 849, "time_per_iteration": 2.7202138900756836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01206318, "balance_loss_mlp": 1.18949735, "diversity_loss_mlp": 0.0, "epoch": 0.16352443247402848, "flos": 507723655680.0, "grad_norm": 0.09796618546415216, "language_loss": 0.91934282, "learning_rate": 0.0009539769786747874, "loss": 0.93140602, "num_input_tokens_seen": 70012288, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 850, "time_per_iteration": 2.6165611743927 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183142, "balance_loss_mlp": 1.16619003, "diversity_loss_mlp": 0.0, "epoch": 0.16371681415929204, "flos": 542124804096.0, "grad_norm": 0.08882238893928415, "language_loss": 0.81184316, "learning_rate": 0.0009538463325425665, "loss": 0.82367456, "num_input_tokens_seen": 70086560, "router_z_loss_mlp": 0.16967773, "routerloss_mlp": 0.0, "step": 851, "time_per_iteration": 2.686708927154541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150762, "balance_loss_mlp": 1.13394117, "diversity_loss_mlp": 0.0, "epoch": 0.1639091958445556, "flos": 520752291840.0, "grad_norm": 0.07439357185799754, "language_loss": 0.85950458, "learning_rate": 0.0009537155102115728, "loss": 0.87101221, "num_input_tokens_seen": 70153968, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 852, "time_per_iteration": 2.5918595790863037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00875998, "balance_loss_mlp": 1.52336514, "diversity_loss_mlp": 0.19506347, "epoch": 0.16410157752981916, "flos": 547414889472.0, "grad_norm": 0.033648266618603755, "language_loss": 0.83653182, "learning_rate": 0.0009535845117325961, "loss": 0.84529185, "num_input_tokens_seen": 70222496, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0167836, "step": 853, "time_per_iteration": 2.724388599395752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106481, "balance_loss_mlp": 1.08957744, "diversity_loss_mlp": 0.0, "epoch": 0.16429395921508272, "flos": 582853791744.0, "grad_norm": 0.08216353114673619, "language_loss": 0.93429655, "learning_rate": 0.0009534533371564946, "loss": 0.94536138, "num_input_tokens_seen": 70301680, "router_z_loss_mlp": 0.16918945, "routerloss_mlp": 0.0, "step": 854, "time_per_iteration": 2.7487661838531494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011031, "balance_loss_mlp": 1.08627963, "diversity_loss_mlp": 0.0, "epoch": 0.16448634090034628, "flos": 530934538752.0, "grad_norm": 0.1393079137823864, "language_loss": 0.88947123, "learning_rate": 0.0009533219865341949, "loss": 0.9005022, "num_input_tokens_seen": 70371152, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 855, "time_per_iteration": 2.5900051593780518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095935, "balance_loss_mlp": 1.0794363, "diversity_loss_mlp": 0.0, "epoch": 0.16467872258560984, "flos": 491890475520.0, "grad_norm": 0.09213408499242232, "language_loss": 0.86629748, "learning_rate": 0.0009531904599166916, "loss": 0.87725687, "num_input_tokens_seen": 70440832, "router_z_loss_mlp": 0.16503906, "routerloss_mlp": 0.0, "step": 856, "time_per_iteration": 2.6516594886779785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093162, "balance_loss_mlp": 1.07659197, "diversity_loss_mlp": 0.0, "epoch": 0.16487110427087343, "flos": 506263385088.0, "grad_norm": 0.11803940214792888, "language_loss": 0.85319799, "learning_rate": 0.0009530587573550478, "loss": 0.86412966, "num_input_tokens_seen": 70507424, "router_z_loss_mlp": 0.16577148, "routerloss_mlp": 0.0, "step": 857, "time_per_iteration": 2.6046345233917236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087423, "balance_loss_mlp": 1.06968486, "diversity_loss_mlp": 0.0, "epoch": 0.16506348595613698, "flos": 1432824712704.0, "grad_norm": 0.035898632567184195, "language_loss": 0.74319386, "learning_rate": 0.0009529268789003953, "loss": 0.75406808, "num_input_tokens_seen": 70742320, "router_z_loss_mlp": 0.17773438, "routerloss_mlp": 0.0, "step": 858, "time_per_iteration": 5.039424180984497 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113669, "balance_loss_mlp": 1.12172914, "diversity_loss_mlp": 0.0, "epoch": 0.16525586764140054, "flos": 477129927168.0, "grad_norm": 0.11200047020164162, "language_loss": 0.90257657, "learning_rate": 0.0009527948246039337, "loss": 0.91394353, "num_input_tokens_seen": 70808400, "router_z_loss_mlp": 0.14929199, "routerloss_mlp": 0.0, "step": 859, "time_per_iteration": 2.550898551940918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00912162, "balance_loss_mlp": 1.5939728, "diversity_loss_mlp": 0.19291875, "epoch": 0.1654482493266641, "flos": 881096942592.0, "grad_norm": 0.041813305841329106, "language_loss": 0.87981749, "learning_rate": 0.000952662594516931, "loss": 0.88893914, "num_input_tokens_seen": 70886192, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01871633, "step": 860, "time_per_iteration": 3.135986089706421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159964, "balance_loss_mlp": 1.14404976, "diversity_loss_mlp": 0.0, "epoch": 0.16564063101192766, "flos": 626841773568.0, "grad_norm": 0.09693666764449156, "language_loss": 0.86321676, "learning_rate": 0.0009525301886907234, "loss": 0.87481636, "num_input_tokens_seen": 70964816, "router_z_loss_mlp": 0.15905762, "routerloss_mlp": 0.0, "step": 861, "time_per_iteration": 2.8601465225219727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117936, "balance_loss_mlp": 1.16340995, "diversity_loss_mlp": 0.0, "epoch": 0.16583301269719122, "flos": 561518355456.0, "grad_norm": 0.08775979857040934, "language_loss": 0.87897611, "learning_rate": 0.0009523976071767155, "loss": 0.89076972, "num_input_tokens_seen": 71037456, "router_z_loss_mlp": 0.15942383, "routerloss_mlp": 0.0, "step": 862, "time_per_iteration": 2.676481246948242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186964, "balance_loss_mlp": 1.17058492, "diversity_loss_mlp": 0.0, "epoch": 0.16602539438245478, "flos": 567803976192.0, "grad_norm": 0.08829714099376759, "language_loss": 0.87565947, "learning_rate": 0.00095226485002638, "loss": 0.88752913, "num_input_tokens_seen": 71111872, "router_z_loss_mlp": 0.16381836, "routerloss_mlp": 0.0, "step": 863, "time_per_iteration": 2.7554168701171875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188442, "balance_loss_mlp": 1.17221785, "diversity_loss_mlp": 0.0, "epoch": 0.16621777606771834, "flos": 574875532800.0, "grad_norm": 0.07683945950910559, "language_loss": 0.89008975, "learning_rate": 0.0009521319172912576, "loss": 0.90197414, "num_input_tokens_seen": 71187808, "router_z_loss_mlp": 0.16223145, "routerloss_mlp": 0.0, "step": 864, "time_per_iteration": 2.7515084743499756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01180456, "balance_loss_mlp": 1.16381395, "diversity_loss_mlp": 0.0, "epoch": 0.16641015775298193, "flos": 514552932864.0, "grad_norm": 0.07957847945510911, "language_loss": 0.95031559, "learning_rate": 0.0009519988090229579, "loss": 0.96212018, "num_input_tokens_seen": 71261728, "router_z_loss_mlp": 0.16650391, "routerloss_mlp": 0.0, "step": 865, "time_per_iteration": 2.671473741531372 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177408, "balance_loss_mlp": 1.16058719, "diversity_loss_mlp": 0.0, "epoch": 0.1666025394382455, "flos": 621685310976.0, "grad_norm": 0.08787110668844439, "language_loss": 0.87748879, "learning_rate": 0.0009518655252731576, "loss": 0.8892628, "num_input_tokens_seen": 71338352, "router_z_loss_mlp": 0.16833496, "routerloss_mlp": 0.0, "step": 866, "time_per_iteration": 2.7561991214752197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152051, "balance_loss_mlp": 1.13470602, "diversity_loss_mlp": 0.0, "epoch": 0.16679492112350905, "flos": 548808348672.0, "grad_norm": 0.07641565274747647, "language_loss": 0.90193641, "learning_rate": 0.0009517320660936022, "loss": 0.91345698, "num_input_tokens_seen": 71416544, "router_z_loss_mlp": 0.17358398, "routerloss_mlp": 0.0, "step": 867, "time_per_iteration": 2.7005693912506104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01177189, "balance_loss_mlp": 1.16064239, "diversity_loss_mlp": 0.0, "epoch": 0.1669873028087726, "flos": 665675864064.0, "grad_norm": 0.08424262891613502, "language_loss": 0.83321446, "learning_rate": 0.0009515984315361051, "loss": 0.84498632, "num_input_tokens_seen": 71494080, "router_z_loss_mlp": 0.16552734, "routerloss_mlp": 0.0, "step": 868, "time_per_iteration": 2.7969586849212646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167914, "balance_loss_mlp": 1.15145087, "diversity_loss_mlp": 0.0, "epoch": 0.16717968449403617, "flos": 538564432896.0, "grad_norm": 0.08829416831991993, "language_loss": 0.87132847, "learning_rate": 0.000951464621652548, "loss": 0.88300765, "num_input_tokens_seen": 71562672, "router_z_loss_mlp": 0.16467285, "routerloss_mlp": 0.0, "step": 869, "time_per_iteration": 2.6121644973754883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152825, "balance_loss_mlp": 1.13639808, "diversity_loss_mlp": 0.0, "epoch": 0.16737206617929973, "flos": 530121438720.0, "grad_norm": 0.07099792340868973, "language_loss": 0.79077303, "learning_rate": 0.0009513306364948804, "loss": 0.80230129, "num_input_tokens_seen": 71641904, "router_z_loss_mlp": 0.16430664, "routerloss_mlp": 0.0, "step": 870, "time_per_iteration": 2.7814862728118896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140954, "balance_loss_mlp": 1.12481356, "diversity_loss_mlp": 0.0, "epoch": 0.1675644478645633, "flos": 480774362112.0, "grad_norm": 0.09401721418936884, "language_loss": 0.89126736, "learning_rate": 0.0009511964761151197, "loss": 0.90267694, "num_input_tokens_seen": 71709616, "router_z_loss_mlp": 0.16137695, "routerloss_mlp": 0.0, "step": 871, "time_per_iteration": 2.601903200149536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152354, "balance_loss_mlp": 1.13628435, "diversity_loss_mlp": 0.0, "epoch": 0.16775682954982685, "flos": 494556627456.0, "grad_norm": 0.07594901152089473, "language_loss": 0.90430808, "learning_rate": 0.0009510621405653521, "loss": 0.91583163, "num_input_tokens_seen": 71776592, "router_z_loss_mlp": 0.16064453, "routerloss_mlp": 0.0, "step": 872, "time_per_iteration": 2.6015260219573975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140995, "balance_loss_mlp": 1.12449682, "diversity_loss_mlp": 0.0, "epoch": 0.1679492112350904, "flos": 752035912704.0, "grad_norm": 0.08553354640914074, "language_loss": 0.84159112, "learning_rate": 0.0009509276298977309, "loss": 0.85300112, "num_input_tokens_seen": 71856352, "router_z_loss_mlp": 0.16503906, "routerloss_mlp": 0.0, "step": 873, "time_per_iteration": 2.979609251022339 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156157, "balance_loss_mlp": 1.13969469, "diversity_loss_mlp": 0.0, "epoch": 0.168141592920354, "flos": 1135875571200.0, "grad_norm": 0.09960357111836311, "language_loss": 0.81973028, "learning_rate": 0.0009507929441644778, "loss": 0.83129185, "num_input_tokens_seen": 71948480, "router_z_loss_mlp": 0.16467285, "routerloss_mlp": 0.0, "step": 874, "time_per_iteration": 3.518749237060547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141075, "balance_loss_mlp": 1.12455297, "diversity_loss_mlp": 0.0, "epoch": 0.16833397460561755, "flos": 632401302528.0, "grad_norm": 0.09789550875526438, "language_loss": 0.86003464, "learning_rate": 0.0009506580834178826, "loss": 0.87144536, "num_input_tokens_seen": 72019200, "router_z_loss_mlp": 0.1652832, "routerloss_mlp": 0.0, "step": 875, "time_per_iteration": 2.7423431873321533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152406, "balance_loss_mlp": 1.13565707, "diversity_loss_mlp": 0.0, "epoch": 0.1685263562908811, "flos": 541445326848.0, "grad_norm": 0.08790070613593892, "language_loss": 0.91631377, "learning_rate": 0.0009505230477103028, "loss": 0.92783785, "num_input_tokens_seen": 72088672, "router_z_loss_mlp": 0.16760254, "routerloss_mlp": 0.0, "step": 876, "time_per_iteration": 2.698725938796997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133355, "balance_loss_mlp": 1.11677289, "diversity_loss_mlp": 0.0, "epoch": 0.16871873797614467, "flos": 619325678592.0, "grad_norm": 0.09908277874944699, "language_loss": 0.81365788, "learning_rate": 0.0009503878370941641, "loss": 0.82499135, "num_input_tokens_seen": 72159952, "router_z_loss_mlp": 0.16589355, "routerloss_mlp": 0.0, "step": 877, "time_per_iteration": 2.791314125061035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00891363, "balance_loss_mlp": 1.54620337, "diversity_loss_mlp": 0.20141272, "epoch": 0.16891111966140823, "flos": 606344030208.0, "grad_norm": 0.04203797903351432, "language_loss": 0.89092785, "learning_rate": 0.0009502524516219595, "loss": 0.89984149, "num_input_tokens_seen": 72231648, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01755447, "step": 878, "time_per_iteration": 2.776076078414917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143141, "balance_loss_mlp": 1.12719083, "diversity_loss_mlp": 0.0, "epoch": 0.1691035013466718, "flos": 552326874624.0, "grad_norm": 0.08982042340710936, "language_loss": 0.90123284, "learning_rate": 0.0009501168913462506, "loss": 0.91266429, "num_input_tokens_seen": 72298608, "router_z_loss_mlp": 0.15942383, "routerloss_mlp": 0.0, "step": 879, "time_per_iteration": 2.6948277950286865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112281, "balance_loss_mlp": 1.09587741, "diversity_loss_mlp": 0.0, "epoch": 0.16929588303193535, "flos": 1476294377472.0, "grad_norm": 0.05096984028598956, "language_loss": 0.79121923, "learning_rate": 0.0009499811563196665, "loss": 0.80234206, "num_input_tokens_seen": 72525312, "router_z_loss_mlp": 0.1640625, "routerloss_mlp": 0.0, "step": 880, "time_per_iteration": 4.850466728210449 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143836, "balance_loss_mlp": 1.12831497, "diversity_loss_mlp": 0.0, "epoch": 0.1694882647171989, "flos": 926248587264.0, "grad_norm": 0.08080936273118028, "language_loss": 0.85235959, "learning_rate": 0.0009498452465949042, "loss": 0.8637979, "num_input_tokens_seen": 72612976, "router_z_loss_mlp": 0.1550293, "routerloss_mlp": 0.0, "step": 881, "time_per_iteration": 3.2163655757904053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147334, "balance_loss_mlp": 1.13156271, "diversity_loss_mlp": 0.0, "epoch": 0.1696806464024625, "flos": 546093010944.0, "grad_norm": 0.06875421208466073, "language_loss": 0.91363323, "learning_rate": 0.0009497091622247285, "loss": 0.92510653, "num_input_tokens_seen": 72686800, "router_z_loss_mlp": 0.15759277, "routerloss_mlp": 0.0, "step": 882, "time_per_iteration": 2.686939239501953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152935, "balance_loss_mlp": 1.13735437, "diversity_loss_mlp": 0.0, "epoch": 0.16987302808772606, "flos": 529234560000.0, "grad_norm": 0.08376903723107024, "language_loss": 0.93688583, "learning_rate": 0.0009495729032619723, "loss": 0.94841516, "num_input_tokens_seen": 72759360, "router_z_loss_mlp": 0.15563965, "routerloss_mlp": 0.0, "step": 883, "time_per_iteration": 2.709554433822632 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164282, "balance_loss_mlp": 1.14845097, "diversity_loss_mlp": 0.0, "epoch": 0.17006540977298962, "flos": 755178909696.0, "grad_norm": 0.07836441801613908, "language_loss": 0.83897853, "learning_rate": 0.0009494364697595354, "loss": 0.85062128, "num_input_tokens_seen": 72831424, "router_z_loss_mlp": 0.15820312, "routerloss_mlp": 0.0, "step": 884, "time_per_iteration": 2.905869722366333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01192457, "balance_loss_mlp": 1.17685246, "diversity_loss_mlp": 0.0, "epoch": 0.17025779145825318, "flos": 558800446464.0, "grad_norm": 0.08347533231949411, "language_loss": 0.89193916, "learning_rate": 0.0009492998617703867, "loss": 0.90386373, "num_input_tokens_seen": 72901536, "router_z_loss_mlp": 0.15588379, "routerloss_mlp": 0.0, "step": 885, "time_per_iteration": 2.655181884765625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196193, "balance_loss_mlp": 1.18021917, "diversity_loss_mlp": 0.0, "epoch": 0.17045017314351674, "flos": 512213124096.0, "grad_norm": 0.09597329726050118, "language_loss": 0.87667245, "learning_rate": 0.0009491630793475619, "loss": 0.88863432, "num_input_tokens_seen": 72970480, "router_z_loss_mlp": 0.15966797, "routerloss_mlp": 0.0, "step": 886, "time_per_iteration": 2.6077725887298584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195953, "balance_loss_mlp": 1.17983615, "diversity_loss_mlp": 0.0, "epoch": 0.1706425548287803, "flos": 508941646848.0, "grad_norm": 0.09161300078510141, "language_loss": 0.8529889, "learning_rate": 0.0009490261225441643, "loss": 0.86494851, "num_input_tokens_seen": 73053376, "router_z_loss_mlp": 0.16113281, "routerloss_mlp": 0.0, "step": 887, "time_per_iteration": 2.8882617950439453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169082, "balance_loss_mlp": 1.15244031, "diversity_loss_mlp": 0.0, "epoch": 0.17083493651404386, "flos": 717355408896.0, "grad_norm": 0.07944379291645969, "language_loss": 0.90366387, "learning_rate": 0.0009488889914133656, "loss": 0.91535467, "num_input_tokens_seen": 73136032, "router_z_loss_mlp": 0.16638184, "routerloss_mlp": 0.0, "step": 888, "time_per_iteration": 2.969808578491211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01192276, "balance_loss_mlp": 1.17532432, "diversity_loss_mlp": 0.0, "epoch": 0.17102731819930742, "flos": 559121647104.0, "grad_norm": 0.0816216626447537, "language_loss": 0.89335579, "learning_rate": 0.0009487516860084047, "loss": 0.90527856, "num_input_tokens_seen": 73208544, "router_z_loss_mlp": 0.16955566, "routerloss_mlp": 0.0, "step": 889, "time_per_iteration": 2.6975717544555664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164555, "balance_loss_mlp": 1.14738929, "diversity_loss_mlp": 0.0, "epoch": 0.17121969988457098, "flos": 494786423808.0, "grad_norm": 0.08956429914743876, "language_loss": 0.88835347, "learning_rate": 0.0009486142063825884, "loss": 0.89999902, "num_input_tokens_seen": 73274336, "router_z_loss_mlp": 0.17175293, "routerloss_mlp": 0.0, "step": 890, "time_per_iteration": 2.5376908779144287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087842, "balance_loss_mlp": 1.07248783, "diversity_loss_mlp": 0.0, "epoch": 0.17141208156983456, "flos": 1548889413120.0, "grad_norm": 0.041165905845677725, "language_loss": 0.72426212, "learning_rate": 0.0009484765525892909, "loss": 0.73514056, "num_input_tokens_seen": 73506320, "router_z_loss_mlp": 0.15332031, "routerloss_mlp": 0.0, "step": 891, "time_per_iteration": 4.961901664733887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168071, "balance_loss_mlp": 1.15150142, "diversity_loss_mlp": 0.0, "epoch": 0.17160446325509812, "flos": 619565386752.0, "grad_norm": 0.09530662242326329, "language_loss": 0.89790797, "learning_rate": 0.0009483387246819542, "loss": 0.90958869, "num_input_tokens_seen": 73578048, "router_z_loss_mlp": 0.16577148, "routerloss_mlp": 0.0, "step": 892, "time_per_iteration": 2.7075483798980713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063774, "balance_loss_mlp": 1.0489924, "diversity_loss_mlp": 0.0, "epoch": 0.17179684494036168, "flos": 1381758206976.0, "grad_norm": 0.03173229244132217, "language_loss": 0.82285583, "learning_rate": 0.0009482007227140877, "loss": 0.83349359, "num_input_tokens_seen": 73798640, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 893, "time_per_iteration": 4.639479398727417 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01175334, "balance_loss_mlp": 1.15915704, "diversity_loss_mlp": 0.0, "epoch": 0.17198922662562524, "flos": 492636764160.0, "grad_norm": 0.09568003043121609, "language_loss": 0.88799989, "learning_rate": 0.0009480625467392688, "loss": 0.89975327, "num_input_tokens_seen": 73867328, "router_z_loss_mlp": 0.16174316, "routerloss_mlp": 0.0, "step": 894, "time_per_iteration": 2.6601061820983887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062971, "balance_loss_mlp": 1.04933381, "diversity_loss_mlp": 0.0, "epoch": 0.1721816083108888, "flos": 1458318878208.0, "grad_norm": 0.02668432598653126, "language_loss": 0.77994668, "learning_rate": 0.0009479241968111421, "loss": 0.79057646, "num_input_tokens_seen": 74093376, "router_z_loss_mlp": 0.13671875, "routerloss_mlp": 0.0, "step": 895, "time_per_iteration": 4.739619970321655 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154117, "balance_loss_mlp": 1.13857174, "diversity_loss_mlp": 0.0, "epoch": 0.17237398999615236, "flos": 528122654208.0, "grad_norm": 0.0641043143423189, "language_loss": 0.87743723, "learning_rate": 0.0009477856729834196, "loss": 0.88897842, "num_input_tokens_seen": 74169136, "router_z_loss_mlp": 0.15527344, "routerloss_mlp": 0.0, "step": 896, "time_per_iteration": 2.7397632598876953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143695, "balance_loss_mlp": 1.12863934, "diversity_loss_mlp": 0.0, "epoch": 0.17256637168141592, "flos": 603920157696.0, "grad_norm": 0.08265751895316475, "language_loss": 0.89999056, "learning_rate": 0.0009476469753098809, "loss": 0.9114275, "num_input_tokens_seen": 74236912, "router_z_loss_mlp": 0.15026855, "routerloss_mlp": 0.0, "step": 897, "time_per_iteration": 2.7494678497314453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151499, "balance_loss_mlp": 1.13624024, "diversity_loss_mlp": 0.0, "epoch": 0.17275875336667948, "flos": 509687935488.0, "grad_norm": 0.08701823937514089, "language_loss": 0.86839932, "learning_rate": 0.0009475081038443738, "loss": 0.87991428, "num_input_tokens_seen": 74305968, "router_z_loss_mlp": 0.15234375, "routerloss_mlp": 0.0, "step": 898, "time_per_iteration": 2.6241486072540283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147135, "balance_loss_mlp": 1.13179302, "diversity_loss_mlp": 0.0, "epoch": 0.17295113505194307, "flos": 665260687872.0, "grad_norm": 0.10104724937619765, "language_loss": 0.85756111, "learning_rate": 0.0009473690586408124, "loss": 0.86903244, "num_input_tokens_seen": 74384144, "router_z_loss_mlp": 0.15319824, "routerloss_mlp": 0.0, "step": 899, "time_per_iteration": 2.8371973037719727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141451, "balance_loss_mlp": 1.1257633, "diversity_loss_mlp": 0.0, "epoch": 0.17314351673720663, "flos": 555385807872.0, "grad_norm": 0.08019640817702944, "language_loss": 0.86364079, "learning_rate": 0.0009472298397531792, "loss": 0.87505525, "num_input_tokens_seen": 74455040, "router_z_loss_mlp": 0.15673828, "routerloss_mlp": 0.0, "step": 900, "time_per_iteration": 2.742392063140869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158392, "balance_loss_mlp": 1.14285886, "diversity_loss_mlp": 0.0, "epoch": 0.17333589842247019, "flos": 503609716224.0, "grad_norm": 0.08623310667606855, "language_loss": 0.86846912, "learning_rate": 0.0009470904472355235, "loss": 0.88005304, "num_input_tokens_seen": 74525248, "router_z_loss_mlp": 0.15515137, "routerloss_mlp": 0.0, "step": 901, "time_per_iteration": 2.6695165634155273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168499, "balance_loss_mlp": 1.15235806, "diversity_loss_mlp": 0.0, "epoch": 0.17352828010773375, "flos": 556208446464.0, "grad_norm": 0.08505658620970231, "language_loss": 0.7976377, "learning_rate": 0.0009469508811419626, "loss": 0.80932266, "num_input_tokens_seen": 74597328, "router_z_loss_mlp": 0.16137695, "routerloss_mlp": 0.0, "step": 902, "time_per_iteration": 2.706495761871338 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01295395, "balance_loss_mlp": 1.28533375, "diversity_loss_mlp": 0.0, "epoch": 0.1737206617929973, "flos": 1554525292032.0, "grad_norm": 0.12561294289393785, "language_loss": 0.7161383, "learning_rate": 0.0009468111415266806, "loss": 0.72909224, "num_input_tokens_seen": 74819664, "router_z_loss_mlp": 0.10058594, "routerloss_mlp": 0.0, "step": 903, "time_per_iteration": 4.816544532775879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01201232, "balance_loss_mlp": 1.18432808, "diversity_loss_mlp": 0.0, "epoch": 0.17391304347826086, "flos": 516662945280.0, "grad_norm": 0.08260915403461032, "language_loss": 0.83578205, "learning_rate": 0.0009466712284439292, "loss": 0.84779429, "num_input_tokens_seen": 74896224, "router_z_loss_mlp": 0.16918945, "routerloss_mlp": 0.0, "step": 904, "time_per_iteration": 2.7518186569213867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01225673, "balance_loss_mlp": 1.20837545, "diversity_loss_mlp": 0.0, "epoch": 0.17410542516352442, "flos": 541049974272.0, "grad_norm": 0.10172065741669829, "language_loss": 0.88445127, "learning_rate": 0.0009465311419480276, "loss": 0.89670801, "num_input_tokens_seen": 74966560, "router_z_loss_mlp": 0.1730957, "routerloss_mlp": 0.0, "step": 905, "time_per_iteration": 2.6713294982910156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01222896, "balance_loss_mlp": 1.20540833, "diversity_loss_mlp": 0.0, "epoch": 0.17429780684878798, "flos": 623849651712.0, "grad_norm": 0.08928567213571854, "language_loss": 0.88188136, "learning_rate": 0.0009463908820933622, "loss": 0.89411032, "num_input_tokens_seen": 75045248, "router_z_loss_mlp": 0.17492676, "routerloss_mlp": 0.0, "step": 906, "time_per_iteration": 2.838935375213623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01211371, "balance_loss_mlp": 1.19455028, "diversity_loss_mlp": 0.0, "epoch": 0.17449018853405157, "flos": 575663666688.0, "grad_norm": 0.07641026648080583, "language_loss": 0.82561022, "learning_rate": 0.0009462504489343868, "loss": 0.83772391, "num_input_tokens_seen": 75123952, "router_z_loss_mlp": 0.16821289, "routerloss_mlp": 0.0, "step": 907, "time_per_iteration": 2.814695119857788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176767, "balance_loss_mlp": 1.15961313, "diversity_loss_mlp": 0.0, "epoch": 0.17468257021931513, "flos": 533753763840.0, "grad_norm": 0.1031074016814366, "language_loss": 0.88790941, "learning_rate": 0.0009461098425256222, "loss": 0.89967716, "num_input_tokens_seen": 75191728, "router_z_loss_mlp": 0.17175293, "routerloss_mlp": 0.0, "step": 908, "time_per_iteration": 2.6116297245025635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159634, "balance_loss_mlp": 1.14329028, "diversity_loss_mlp": 0.0, "epoch": 0.1748749519045787, "flos": 540758509056.0, "grad_norm": 0.08015161116044169, "language_loss": 0.86030436, "learning_rate": 0.0009459690629216567, "loss": 0.87190068, "num_input_tokens_seen": 75262224, "router_z_loss_mlp": 0.16345215, "routerloss_mlp": 0.0, "step": 909, "time_per_iteration": 2.6483752727508545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130085, "balance_loss_mlp": 1.11407518, "diversity_loss_mlp": 0.0, "epoch": 0.17506733358984225, "flos": 498623579136.0, "grad_norm": 0.1301831169035446, "language_loss": 0.87761313, "learning_rate": 0.0009458281101771457, "loss": 0.88891399, "num_input_tokens_seen": 75329760, "router_z_loss_mlp": 0.16003418, "routerloss_mlp": 0.0, "step": 910, "time_per_iteration": 2.6089227199554443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00992009, "balance_loss_mlp": 1.75545192, "diversity_loss_mlp": 0.19214596, "epoch": 0.1752597152751058, "flos": 622923125760.0, "grad_norm": 0.033219305186726854, "language_loss": 0.82887536, "learning_rate": 0.0009456869843468122, "loss": 0.83879542, "num_input_tokens_seen": 75407920, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01820984, "step": 911, "time_per_iteration": 2.895577907562256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110935, "balance_loss_mlp": 1.09519958, "diversity_loss_mlp": 0.0, "epoch": 0.17545209696036937, "flos": 520972176384.0, "grad_norm": 0.09801228329993106, "language_loss": 0.78689641, "learning_rate": 0.0009455456854854459, "loss": 0.79800576, "num_input_tokens_seen": 75476752, "router_z_loss_mlp": 0.15722656, "routerloss_mlp": 0.0, "step": 912, "time_per_iteration": 2.61677885055542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112332, "balance_loss_mlp": 1.09684718, "diversity_loss_mlp": 0.0, "epoch": 0.17564447864563293, "flos": 461988707328.0, "grad_norm": 0.10345929433375275, "language_loss": 0.84027654, "learning_rate": 0.0009454042136479039, "loss": 0.8513999, "num_input_tokens_seen": 75542944, "router_z_loss_mlp": 0.15466309, "routerloss_mlp": 0.0, "step": 913, "time_per_iteration": 2.63289737701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00970368, "balance_loss_mlp": 1.71473479, "diversity_loss_mlp": 0.18966624, "epoch": 0.1758368603308965, "flos": 480655793664.0, "grad_norm": 0.036406885856323776, "language_loss": 0.82874572, "learning_rate": 0.0009452625688891103, "loss": 0.83844936, "num_input_tokens_seen": 75609840, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01816791, "step": 914, "time_per_iteration": 2.5505056381225586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00652668, "balance_loss_mlp": 1.1176697, "diversity_loss_mlp": 0.15453993, "epoch": 0.17602924201616005, "flos": 1478942903808.0, "grad_norm": 0.002103211778310914, "language_loss": 0.78734738, "learning_rate": 0.0009451207512640567, "loss": 0.79387403, "num_input_tokens_seen": 75819312, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01656273, "step": 915, "time_per_iteration": 4.6835761070251465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138887, "balance_loss_mlp": 1.12381876, "diversity_loss_mlp": 0.0, "epoch": 0.17622162370142364, "flos": 602301671424.0, "grad_norm": 0.10180381633640839, "language_loss": 0.92940623, "learning_rate": 0.0009449787608278015, "loss": 0.94079512, "num_input_tokens_seen": 75893984, "router_z_loss_mlp": 0.1505127, "routerloss_mlp": 0.0, "step": 916, "time_per_iteration": 2.7294180393218994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155245, "balance_loss_mlp": 1.13949776, "diversity_loss_mlp": 0.0, "epoch": 0.1764140053866872, "flos": 442699043328.0, "grad_norm": 0.08481056496958321, "language_loss": 0.92318904, "learning_rate": 0.0009448365976354704, "loss": 0.9347415, "num_input_tokens_seen": 75958944, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 917, "time_per_iteration": 2.4908158779144287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174187, "balance_loss_mlp": 1.15821338, "diversity_loss_mlp": 0.0, "epoch": 0.17660638707195075, "flos": 500607682560.0, "grad_norm": 0.1031397623895646, "language_loss": 0.89928877, "learning_rate": 0.0009446942617422558, "loss": 0.91103065, "num_input_tokens_seen": 76024240, "router_z_loss_mlp": 0.15966797, "routerloss_mlp": 0.0, "step": 918, "time_per_iteration": 2.5721499919891357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01191219, "balance_loss_mlp": 1.1748755, "diversity_loss_mlp": 0.0, "epoch": 0.17679876875721431, "flos": 538892974080.0, "grad_norm": 0.17804953788653613, "language_loss": 0.85687363, "learning_rate": 0.0009445517532034176, "loss": 0.86878586, "num_input_tokens_seen": 76095264, "router_z_loss_mlp": 0.16345215, "routerloss_mlp": 0.0, "step": 919, "time_per_iteration": 2.6613845825195312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195517, "balance_loss_mlp": 1.18031824, "diversity_loss_mlp": 0.0, "epoch": 0.17699115044247787, "flos": 497724217344.0, "grad_norm": 0.09678678856513988, "language_loss": 0.89147103, "learning_rate": 0.0009444090720742824, "loss": 0.90342629, "num_input_tokens_seen": 76163520, "router_z_loss_mlp": 0.15185547, "routerloss_mlp": 0.0, "step": 920, "time_per_iteration": 2.587042808532715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186456, "balance_loss_mlp": 1.17107785, "diversity_loss_mlp": 0.0, "epoch": 0.17718353212774143, "flos": 662738070528.0, "grad_norm": 0.10185153476697495, "language_loss": 0.87654328, "learning_rate": 0.0009442662184102439, "loss": 0.88840789, "num_input_tokens_seen": 76233760, "router_z_loss_mlp": 0.15368652, "routerloss_mlp": 0.0, "step": 921, "time_per_iteration": 2.8263702392578125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153869, "balance_loss_mlp": 1.13851511, "diversity_loss_mlp": 0.0, "epoch": 0.177375913813005, "flos": 582641247744.0, "grad_norm": 0.07023953845341, "language_loss": 0.87764925, "learning_rate": 0.000944123192266763, "loss": 0.88918793, "num_input_tokens_seen": 76310704, "router_z_loss_mlp": 0.15344238, "routerloss_mlp": 0.0, "step": 922, "time_per_iteration": 2.789288282394409 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00914197, "balance_loss_mlp": 1.60349846, "diversity_loss_mlp": 0.18745996, "epoch": 0.17756829549826855, "flos": 552564011520.0, "grad_norm": 0.03372690713262746, "language_loss": 0.83555657, "learning_rate": 0.0009439799936993671, "loss": 0.84469855, "num_input_tokens_seen": 76386992, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01871805, "step": 923, "time_per_iteration": 2.7374520301818848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137351, "balance_loss_mlp": 1.12125802, "diversity_loss_mlp": 0.0, "epoch": 0.17776067718353214, "flos": 556322245632.0, "grad_norm": 0.08202300708599226, "language_loss": 0.87886107, "learning_rate": 0.0009438366227636511, "loss": 0.89023459, "num_input_tokens_seen": 76453328, "router_z_loss_mlp": 0.16088867, "routerloss_mlp": 0.0, "step": 924, "time_per_iteration": 2.7159595489501953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148154, "balance_loss_mlp": 1.13190556, "diversity_loss_mlp": 0.0, "epoch": 0.1779530588687957, "flos": 658458574848.0, "grad_norm": 0.08035818105278464, "language_loss": 0.86048192, "learning_rate": 0.0009436930795152763, "loss": 0.8719635, "num_input_tokens_seen": 76529040, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 925, "time_per_iteration": 2.8248116970062256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143318, "balance_loss_mlp": 1.12739205, "diversity_loss_mlp": 0.0, "epoch": 0.17814544055405926, "flos": 644483589120.0, "grad_norm": 0.07405817727017547, "language_loss": 0.86317486, "learning_rate": 0.0009435493640099713, "loss": 0.87460804, "num_input_tokens_seen": 76604080, "router_z_loss_mlp": 0.15917969, "routerloss_mlp": 0.0, "step": 926, "time_per_iteration": 2.8155741691589355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161834, "balance_loss_mlp": 1.1451211, "diversity_loss_mlp": 0.0, "epoch": 0.17833782223932282, "flos": 460913877504.0, "grad_norm": 0.09122083849675254, "language_loss": 0.84453332, "learning_rate": 0.0009434054763035314, "loss": 0.8561517, "num_input_tokens_seen": 76674096, "router_z_loss_mlp": 0.16723633, "routerloss_mlp": 0.0, "step": 927, "time_per_iteration": 2.636686325073242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158411, "balance_loss_mlp": 1.1422224, "diversity_loss_mlp": 0.0, "epoch": 0.17853020392458638, "flos": 759539897856.0, "grad_norm": 0.0663266274239875, "language_loss": 0.85362542, "learning_rate": 0.0009432614164518185, "loss": 0.86520946, "num_input_tokens_seen": 76752144, "router_z_loss_mlp": 0.16186523, "routerloss_mlp": 0.0, "step": 928, "time_per_iteration": 2.9446685314178467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171163, "balance_loss_mlp": 1.15443754, "diversity_loss_mlp": 0.0, "epoch": 0.17872258560984994, "flos": 782666717184.0, "grad_norm": 0.07726522608444414, "language_loss": 0.84178561, "learning_rate": 0.000943117184510762, "loss": 0.85349721, "num_input_tokens_seen": 76830240, "router_z_loss_mlp": 0.1673584, "routerloss_mlp": 0.0, "step": 929, "time_per_iteration": 3.0194530487060547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01175374, "balance_loss_mlp": 1.16435885, "diversity_loss_mlp": 0.0, "epoch": 0.1789149672951135, "flos": 1459880464896.0, "grad_norm": 0.030831515732685378, "language_loss": 0.78789961, "learning_rate": 0.0009429727805363575, "loss": 0.79965341, "num_input_tokens_seen": 77062464, "router_z_loss_mlp": 0.11035156, "routerloss_mlp": 0.0, "step": 930, "time_per_iteration": 5.04656982421875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172004, "balance_loss_mlp": 1.15555263, "diversity_loss_mlp": 0.0, "epoch": 0.17910734898037706, "flos": 503864105472.0, "grad_norm": 0.08209248711818126, "language_loss": 0.88495553, "learning_rate": 0.0009428282045846674, "loss": 0.89667559, "num_input_tokens_seen": 77136672, "router_z_loss_mlp": 0.16455078, "routerloss_mlp": 0.0, "step": 931, "time_per_iteration": 2.6833221912384033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00905029, "balance_loss_mlp": 1.58147573, "diversity_loss_mlp": 0.18920106, "epoch": 0.17929973066564064, "flos": 746249158656.0, "grad_norm": 0.030391877730158674, "language_loss": 0.89804769, "learning_rate": 0.0009426834567118214, "loss": 0.90709794, "num_input_tokens_seen": 77227040, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01969042, "step": 932, "time_per_iteration": 3.0804004669189453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174106, "balance_loss_mlp": 1.15761924, "diversity_loss_mlp": 0.0, "epoch": 0.1794921123509042, "flos": 713214305280.0, "grad_norm": 0.06967623980831897, "language_loss": 0.80600739, "learning_rate": 0.0009425385369740155, "loss": 0.81774843, "num_input_tokens_seen": 77319392, "router_z_loss_mlp": 0.16491699, "routerloss_mlp": 0.0, "step": 933, "time_per_iteration": 3.039576530456543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172613, "balance_loss_mlp": 1.15553069, "diversity_loss_mlp": 0.0, "epoch": 0.17968449403616776, "flos": 633142448640.0, "grad_norm": 0.09198882046168515, "language_loss": 0.87049097, "learning_rate": 0.0009423934454275125, "loss": 0.88221705, "num_input_tokens_seen": 77394688, "router_z_loss_mlp": 0.17102051, "routerloss_mlp": 0.0, "step": 934, "time_per_iteration": 2.8528192043304443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147429, "balance_loss_mlp": 1.13053656, "diversity_loss_mlp": 0.0, "epoch": 0.17987687572143132, "flos": 536323368960.0, "grad_norm": 0.09002999058802562, "language_loss": 0.92077851, "learning_rate": 0.0009422481821286418, "loss": 0.93225282, "num_input_tokens_seen": 77468288, "router_z_loss_mlp": 0.16906738, "routerloss_mlp": 0.0, "step": 935, "time_per_iteration": 2.720700740814209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140916, "balance_loss_mlp": 1.12434602, "diversity_loss_mlp": 0.0, "epoch": 0.18006925740669488, "flos": 538077676032.0, "grad_norm": 0.11818586168906865, "language_loss": 0.88474637, "learning_rate": 0.0009421027471337998, "loss": 0.89615548, "num_input_tokens_seen": 77535840, "router_z_loss_mlp": 0.16577148, "routerloss_mlp": 0.0, "step": 936, "time_per_iteration": 2.61820125579834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114364, "balance_loss_mlp": 1.12680769, "diversity_loss_mlp": 0.0, "epoch": 0.18026163909195844, "flos": 539510782464.0, "grad_norm": 0.13119105141522364, "language_loss": 0.82430404, "learning_rate": 0.0009419571404994493, "loss": 0.83574045, "num_input_tokens_seen": 77604000, "router_z_loss_mlp": 0.16845703, "routerloss_mlp": 0.0, "step": 937, "time_per_iteration": 2.6458749771118164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126757, "balance_loss_mlp": 1.11016333, "diversity_loss_mlp": 0.0, "epoch": 0.180454020777222, "flos": 500642187264.0, "grad_norm": 0.10011425098636609, "language_loss": 0.90748799, "learning_rate": 0.00094181136228212, "loss": 0.91875559, "num_input_tokens_seen": 77671488, "router_z_loss_mlp": 0.16589355, "routerloss_mlp": 0.0, "step": 938, "time_per_iteration": 2.659946918487549 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132333, "balance_loss_mlp": 1.11602521, "diversity_loss_mlp": 0.0, "epoch": 0.18064640246248556, "flos": 498952120320.0, "grad_norm": 0.06984091109722412, "language_loss": 0.86027002, "learning_rate": 0.0009416654125384077, "loss": 0.8715933, "num_input_tokens_seen": 77746240, "router_z_loss_mlp": 0.16308594, "routerloss_mlp": 0.0, "step": 939, "time_per_iteration": 2.723839044570923 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01182476, "balance_loss_mlp": 1.17174697, "diversity_loss_mlp": 0.0, "epoch": 0.18083878414774912, "flos": 1519313988096.0, "grad_norm": 0.0414358910702132, "language_loss": 0.79772377, "learning_rate": 0.0009415192913249752, "loss": 0.8095485, "num_input_tokens_seen": 77966080, "router_z_loss_mlp": 0.10742188, "routerloss_mlp": 0.0, "step": 940, "time_per_iteration": 4.920511722564697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141007, "balance_loss_mlp": 1.12453222, "diversity_loss_mlp": 0.0, "epoch": 0.1810311658330127, "flos": 727337594880.0, "grad_norm": 0.0813056862192268, "language_loss": 0.83903325, "learning_rate": 0.000941372998698552, "loss": 0.85044336, "num_input_tokens_seen": 78049200, "router_z_loss_mlp": 0.16479492, "routerloss_mlp": 0.0, "step": 941, "time_per_iteration": 2.937645673751831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00896978, "balance_loss_mlp": 1.56833267, "diversity_loss_mlp": 0.1911485, "epoch": 0.18122354751827627, "flos": 564923082240.0, "grad_norm": 0.04191931915848681, "language_loss": 0.82149267, "learning_rate": 0.0009412265347159336, "loss": 0.83046246, "num_input_tokens_seen": 78122752, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0172378, "step": 942, "time_per_iteration": 2.7250781059265137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116112, "balance_loss_mlp": 1.14446664, "diversity_loss_mlp": 0.0, "epoch": 0.18141592920353983, "flos": 519282109440.0, "grad_norm": 0.08706600394859935, "language_loss": 0.84761524, "learning_rate": 0.0009410798994339829, "loss": 0.85922647, "num_input_tokens_seen": 78194064, "router_z_loss_mlp": 0.16662598, "routerloss_mlp": 0.0, "step": 943, "time_per_iteration": 2.5916900634765625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115721, "balance_loss_mlp": 1.14027047, "diversity_loss_mlp": 0.0, "epoch": 0.1816083108888034, "flos": 512470084608.0, "grad_norm": 0.07414862428622851, "language_loss": 0.87698966, "learning_rate": 0.000940933092909628, "loss": 0.88856173, "num_input_tokens_seen": 78262048, "router_z_loss_mlp": 0.16943359, "routerloss_mlp": 0.0, "step": 944, "time_per_iteration": 2.6747801303863525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166789, "balance_loss_mlp": 1.15049326, "diversity_loss_mlp": 0.0, "epoch": 0.18180069257406695, "flos": 492389715456.0, "grad_norm": 0.07390491400887403, "language_loss": 0.83424389, "learning_rate": 0.0009407861151998649, "loss": 0.84591174, "num_input_tokens_seen": 78330624, "router_z_loss_mlp": 0.16296387, "routerloss_mlp": 0.0, "step": 945, "time_per_iteration": 2.602691411972046 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163795, "balance_loss_mlp": 1.14708209, "diversity_loss_mlp": 0.0, "epoch": 0.1819930742593305, "flos": 570158839296.0, "grad_norm": 0.07435679337016335, "language_loss": 0.86087269, "learning_rate": 0.0009406389663617552, "loss": 0.87251067, "num_input_tokens_seen": 78400672, "router_z_loss_mlp": 0.16723633, "routerloss_mlp": 0.0, "step": 946, "time_per_iteration": 2.6775379180908203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139209, "balance_loss_mlp": 1.12300825, "diversity_loss_mlp": 0.0, "epoch": 0.18218545594459407, "flos": 605975841792.0, "grad_norm": 0.08423780444915897, "language_loss": 0.86031067, "learning_rate": 0.000940491646452427, "loss": 0.87170279, "num_input_tokens_seen": 78467952, "router_z_loss_mlp": 0.1619873, "routerloss_mlp": 0.0, "step": 947, "time_per_iteration": 2.717313051223755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134537, "balance_loss_mlp": 1.11805058, "diversity_loss_mlp": 0.0, "epoch": 0.18237783762985763, "flos": 548682439680.0, "grad_norm": 0.0716601161320721, "language_loss": 0.90799212, "learning_rate": 0.000940344155529075, "loss": 0.91933751, "num_input_tokens_seen": 78538928, "router_z_loss_mlp": 0.16479492, "routerloss_mlp": 0.0, "step": 948, "time_per_iteration": 2.645601749420166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00905236, "balance_loss_mlp": 1.57791471, "diversity_loss_mlp": 0.19691566, "epoch": 0.1825702193151212, "flos": 450741542400.0, "grad_norm": 0.03478780514937427, "language_loss": 0.87420666, "learning_rate": 0.0009401964936489605, "loss": 0.883259, "num_input_tokens_seen": 78602144, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01782099, "step": 949, "time_per_iteration": 2.546546459197998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132433, "balance_loss_mlp": 1.11666203, "diversity_loss_mlp": 0.0, "epoch": 0.18276260100038477, "flos": 589245871104.0, "grad_norm": 0.11218622077210595, "language_loss": 0.85308415, "learning_rate": 0.0009400486608694108, "loss": 0.86440849, "num_input_tokens_seen": 78673152, "router_z_loss_mlp": 0.15759277, "routerloss_mlp": 0.0, "step": 950, "time_per_iteration": 2.71462345123291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135805, "balance_loss_mlp": 1.1190201, "diversity_loss_mlp": 0.0, "epoch": 0.18295498268564833, "flos": 787331653632.0, "grad_norm": 0.07143871570155125, "language_loss": 0.87176299, "learning_rate": 0.0009399006572478195, "loss": 0.88312101, "num_input_tokens_seen": 78753872, "router_z_loss_mlp": 0.16796875, "routerloss_mlp": 0.0, "step": 951, "time_per_iteration": 3.0933260917663574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137853, "balance_loss_mlp": 1.12129509, "diversity_loss_mlp": 0.0, "epoch": 0.1831473643709119, "flos": 578147010048.0, "grad_norm": 0.08672794105569953, "language_loss": 0.90997601, "learning_rate": 0.0009397524828416468, "loss": 0.92135453, "num_input_tokens_seen": 78822640, "router_z_loss_mlp": 0.16564941, "routerloss_mlp": 0.0, "step": 952, "time_per_iteration": 2.6721160411834717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00906668, "balance_loss_mlp": 1.58174932, "diversity_loss_mlp": 0.19792399, "epoch": 0.18333974605617545, "flos": 566889933312.0, "grad_norm": 0.0341945315399877, "language_loss": 0.96079636, "learning_rate": 0.0009396041377084192, "loss": 0.96986312, "num_input_tokens_seen": 78893792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01683164, "step": 953, "time_per_iteration": 2.6563429832458496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147916, "balance_loss_mlp": 1.1312983, "diversity_loss_mlp": 0.0, "epoch": 0.183532127741439, "flos": 526993496064.0, "grad_norm": 0.07156922543086394, "language_loss": 0.87274891, "learning_rate": 0.0009394556219057295, "loss": 0.88422805, "num_input_tokens_seen": 78964752, "router_z_loss_mlp": 0.16625977, "routerloss_mlp": 0.0, "step": 954, "time_per_iteration": 2.710129499435425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164762, "balance_loss_mlp": 1.1480366, "diversity_loss_mlp": 0.0, "epoch": 0.18372450942670257, "flos": 594535956480.0, "grad_norm": 0.08933499459227748, "language_loss": 0.83389091, "learning_rate": 0.0009393069354912362, "loss": 0.84553862, "num_input_tokens_seen": 79034400, "router_z_loss_mlp": 0.1673584, "routerloss_mlp": 0.0, "step": 955, "time_per_iteration": 2.736077070236206 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162546, "balance_loss_mlp": 1.1459167, "diversity_loss_mlp": 0.0, "epoch": 0.18391689111196613, "flos": 645032014848.0, "grad_norm": 0.10088049230192819, "language_loss": 0.81851852, "learning_rate": 0.0009391580785226649, "loss": 0.83014399, "num_input_tokens_seen": 79109488, "router_z_loss_mlp": 0.16638184, "routerloss_mlp": 0.0, "step": 956, "time_per_iteration": 2.8675243854522705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139492, "balance_loss_mlp": 1.12933517, "diversity_loss_mlp": 0.0, "epoch": 0.18410927279722972, "flos": 1457073349632.0, "grad_norm": 0.028623000900350283, "language_loss": 0.79340446, "learning_rate": 0.0009390090510578067, "loss": 0.80479944, "num_input_tokens_seen": 79327712, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 957, "time_per_iteration": 4.758531332015991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128949, "balance_loss_mlp": 1.11177051, "diversity_loss_mlp": 0.0, "epoch": 0.18430165448249328, "flos": 658750040064.0, "grad_norm": 0.0742792603097427, "language_loss": 0.8674221, "learning_rate": 0.0009388598531545196, "loss": 0.87871158, "num_input_tokens_seen": 79401504, "router_z_loss_mlp": 0.17175293, "routerloss_mlp": 0.0, "step": 958, "time_per_iteration": 2.8665144443511963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110151, "balance_loss_mlp": 1.09304404, "diversity_loss_mlp": 0.0, "epoch": 0.18449403616775684, "flos": 517933066752.0, "grad_norm": 0.08387101873752756, "language_loss": 0.85292655, "learning_rate": 0.000938710484870727, "loss": 0.86402804, "num_input_tokens_seen": 79466688, "router_z_loss_mlp": 0.17126465, "routerloss_mlp": 0.0, "step": 959, "time_per_iteration": 2.5621094703674316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113798, "balance_loss_mlp": 1.09718001, "diversity_loss_mlp": 0.0, "epoch": 0.1846864178530204, "flos": 552749391360.0, "grad_norm": 0.08027143748444723, "language_loss": 0.85896957, "learning_rate": 0.0009385609462644189, "loss": 0.87010753, "num_input_tokens_seen": 79540288, "router_z_loss_mlp": 0.16625977, "routerloss_mlp": 0.0, "step": 960, "time_per_iteration": 2.6949400901794434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122642, "balance_loss_mlp": 1.10596502, "diversity_loss_mlp": 0.0, "epoch": 0.18487879953828396, "flos": 466166886912.0, "grad_norm": 0.07967759372686231, "language_loss": 0.8535409, "learning_rate": 0.0009384112373936514, "loss": 0.86476731, "num_input_tokens_seen": 79611872, "router_z_loss_mlp": 0.16674805, "routerloss_mlp": 0.0, "step": 961, "time_per_iteration": 2.644244432449341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132475, "balance_loss_mlp": 1.11566615, "diversity_loss_mlp": 0.0, "epoch": 0.18507118122354752, "flos": 648496212480.0, "grad_norm": 0.09330138113238175, "language_loss": 0.91539109, "learning_rate": 0.0009382613583165467, "loss": 0.92671585, "num_input_tokens_seen": 79689504, "router_z_loss_mlp": 0.16821289, "routerloss_mlp": 0.0, "step": 962, "time_per_iteration": 2.8191375732421875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128481, "balance_loss_mlp": 1.11161256, "diversity_loss_mlp": 0.0, "epoch": 0.18526356290881107, "flos": 626772764160.0, "grad_norm": 0.08799115365988901, "language_loss": 0.89600122, "learning_rate": 0.0009381113090912928, "loss": 0.90728599, "num_input_tokens_seen": 79759264, "router_z_loss_mlp": 0.16882324, "routerloss_mlp": 0.0, "step": 963, "time_per_iteration": 2.77341890335083 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137775, "balance_loss_mlp": 1.12159812, "diversity_loss_mlp": 0.0, "epoch": 0.18545594459407463, "flos": 432726769152.0, "grad_norm": 0.08224545608030313, "language_loss": 0.89354098, "learning_rate": 0.000937961089776144, "loss": 0.90491867, "num_input_tokens_seen": 79824464, "router_z_loss_mlp": 0.16174316, "routerloss_mlp": 0.0, "step": 964, "time_per_iteration": 2.6057045459747314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140677, "balance_loss_mlp": 1.12448788, "diversity_loss_mlp": 0.0, "epoch": 0.1856483262793382, "flos": 749061043200.0, "grad_norm": 0.08763662153745684, "language_loss": 0.82399738, "learning_rate": 0.0009378107004294208, "loss": 0.83540416, "num_input_tokens_seen": 79907152, "router_z_loss_mlp": 0.16186523, "routerloss_mlp": 0.0, "step": 965, "time_per_iteration": 2.9792187213897705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132665, "balance_loss_mlp": 1.11624968, "diversity_loss_mlp": 0.0, "epoch": 0.18584070796460178, "flos": 530326642176.0, "grad_norm": 0.0696996408734829, "language_loss": 0.91584361, "learning_rate": 0.0009376601411095096, "loss": 0.92717028, "num_input_tokens_seen": 79976944, "router_z_loss_mlp": 0.16418457, "routerloss_mlp": 0.0, "step": 966, "time_per_iteration": 2.6557700634002686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108368, "balance_loss_mlp": 1.09209585, "diversity_loss_mlp": 0.0, "epoch": 0.18603308964986534, "flos": 483106830336.0, "grad_norm": 0.0928645758984953, "language_loss": 0.86438054, "learning_rate": 0.0009375094118748622, "loss": 0.8754642, "num_input_tokens_seen": 80042112, "router_z_loss_mlp": 0.16271973, "routerloss_mlp": 0.0, "step": 967, "time_per_iteration": 2.5574727058410645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121341, "balance_loss_mlp": 1.10546279, "diversity_loss_mlp": 0.0, "epoch": 0.1862254713351289, "flos": 801316551168.0, "grad_norm": 0.08866997131388626, "language_loss": 0.90710455, "learning_rate": 0.0009373585127839976, "loss": 0.91831791, "num_input_tokens_seen": 80118896, "router_z_loss_mlp": 0.15869141, "routerloss_mlp": 0.0, "step": 968, "time_per_iteration": 2.9949731826782227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122311, "balance_loss_mlp": 1.1066587, "diversity_loss_mlp": 0.0, "epoch": 0.18641785302039246, "flos": 478323325440.0, "grad_norm": 0.08663719992470821, "language_loss": 0.90892541, "learning_rate": 0.0009372074438954994, "loss": 0.92014849, "num_input_tokens_seen": 80183360, "router_z_loss_mlp": 0.15637207, "routerloss_mlp": 0.0, "step": 969, "time_per_iteration": 2.583392381668091 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115205, "balance_loss_mlp": 1.09983897, "diversity_loss_mlp": 0.0, "epoch": 0.18661023470565602, "flos": 388911684096.0, "grad_norm": 0.1288159292638968, "language_loss": 0.91714692, "learning_rate": 0.0009370562052680181, "loss": 0.92829901, "num_input_tokens_seen": 80247024, "router_z_loss_mlp": 0.15356445, "routerloss_mlp": 0.0, "step": 970, "time_per_iteration": 2.476053476333618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131477, "balance_loss_mlp": 1.1160872, "diversity_loss_mlp": 0.0, "epoch": 0.18680261639091958, "flos": 564676033536.0, "grad_norm": 0.05501755081279848, "language_loss": 0.89296091, "learning_rate": 0.0009369047969602695, "loss": 0.90427566, "num_input_tokens_seen": 80318256, "router_z_loss_mlp": 0.15368652, "routerloss_mlp": 0.0, "step": 971, "time_per_iteration": 2.705310344696045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161734, "balance_loss_mlp": 1.14604628, "diversity_loss_mlp": 0.0, "epoch": 0.18699499807618314, "flos": 479259763200.0, "grad_norm": 0.09590230746039986, "language_loss": 0.86690193, "learning_rate": 0.0009367532190310357, "loss": 0.8785193, "num_input_tokens_seen": 80384848, "router_z_loss_mlp": 0.15673828, "routerloss_mlp": 0.0, "step": 972, "time_per_iteration": 2.551683187484741 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151378, "balance_loss_mlp": 1.13526106, "diversity_loss_mlp": 0.0, "epoch": 0.1871873797614467, "flos": 553283136000.0, "grad_norm": 0.13723256450586457, "language_loss": 0.88859725, "learning_rate": 0.0009366014715391644, "loss": 0.90011096, "num_input_tokens_seen": 80453088, "router_z_loss_mlp": 0.16113281, "routerloss_mlp": 0.0, "step": 973, "time_per_iteration": 2.6311707496643066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140536, "balance_loss_mlp": 1.12521768, "diversity_loss_mlp": 0.0, "epoch": 0.18737976144671029, "flos": 552811060224.0, "grad_norm": 0.0667022200872989, "language_loss": 0.83902818, "learning_rate": 0.0009364495545435693, "loss": 0.85043353, "num_input_tokens_seen": 80528608, "router_z_loss_mlp": 0.15307617, "routerloss_mlp": 0.0, "step": 974, "time_per_iteration": 2.756056308746338 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121055, "balance_loss_mlp": 1.10528326, "diversity_loss_mlp": 0.0, "epoch": 0.18757214313197385, "flos": 502250761728.0, "grad_norm": 0.06720472395514528, "language_loss": 0.88235438, "learning_rate": 0.0009362974681032297, "loss": 0.89356488, "num_input_tokens_seen": 80599600, "router_z_loss_mlp": 0.15759277, "routerloss_mlp": 0.0, "step": 975, "time_per_iteration": 2.601027488708496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117499, "balance_loss_mlp": 1.10179889, "diversity_loss_mlp": 0.0, "epoch": 0.1877645248172374, "flos": 675010506240.0, "grad_norm": 0.09372829562862567, "language_loss": 0.88529336, "learning_rate": 0.0009361452122771907, "loss": 0.8964684, "num_input_tokens_seen": 80677264, "router_z_loss_mlp": 0.15698242, "routerloss_mlp": 0.0, "step": 976, "time_per_iteration": 2.8729074001312256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124468, "balance_loss_mlp": 1.107934, "diversity_loss_mlp": 0.0, "epoch": 0.18795690650250096, "flos": 404989341696.0, "grad_norm": 0.10248565336705484, "language_loss": 0.83506191, "learning_rate": 0.0009359927871245635, "loss": 0.84630656, "num_input_tokens_seen": 80739776, "router_z_loss_mlp": 0.16540527, "routerloss_mlp": 0.0, "step": 977, "time_per_iteration": 2.4633541107177734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114403, "balance_loss_mlp": 1.12861657, "diversity_loss_mlp": 0.0, "epoch": 0.18814928818776452, "flos": 637891448832.0, "grad_norm": 0.09207140211488826, "language_loss": 0.85937703, "learning_rate": 0.0009358401927045246, "loss": 0.87081736, "num_input_tokens_seen": 80815200, "router_z_loss_mlp": 0.15393066, "routerloss_mlp": 0.0, "step": 978, "time_per_iteration": 2.8528451919555664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165656, "balance_loss_mlp": 1.15002799, "diversity_loss_mlp": 0.0, "epoch": 0.18834166987302808, "flos": 1138282191360.0, "grad_norm": 0.09819064259764942, "language_loss": 0.88151729, "learning_rate": 0.0009356874290763166, "loss": 0.89317381, "num_input_tokens_seen": 80905024, "router_z_loss_mlp": 0.15625, "routerloss_mlp": 0.0, "step": 979, "time_per_iteration": 3.4732589721679688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165217, "balance_loss_mlp": 1.14985144, "diversity_loss_mlp": 0.0, "epoch": 0.18853405155829164, "flos": 504793202688.0, "grad_norm": 0.07125364842819645, "language_loss": 0.88739443, "learning_rate": 0.0009355344962992474, "loss": 0.8990466, "num_input_tokens_seen": 80976704, "router_z_loss_mlp": 0.15344238, "routerloss_mlp": 0.0, "step": 980, "time_per_iteration": 2.618013381958008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0092711, "balance_loss_mlp": 1.61735535, "diversity_loss_mlp": 0.20325859, "epoch": 0.1887264332435552, "flos": 608177258496.0, "grad_norm": 0.031158428526317693, "language_loss": 0.8787328, "learning_rate": 0.0009353813944326908, "loss": 0.88800395, "num_input_tokens_seen": 81057152, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0168031, "step": 981, "time_per_iteration": 2.926612377166748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00925726, "balance_loss_mlp": 1.616956, "diversity_loss_mlp": 0.20126666, "epoch": 0.1889188149288188, "flos": 552529506816.0, "grad_norm": 0.0354798675553145, "language_loss": 0.82752389, "learning_rate": 0.0009352281235360863, "loss": 0.83678114, "num_input_tokens_seen": 81131520, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01661466, "step": 982, "time_per_iteration": 2.7461719512939453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156754, "balance_loss_mlp": 1.14193642, "diversity_loss_mlp": 0.0, "epoch": 0.18911119661408235, "flos": 418559063040.0, "grad_norm": 0.08008026175511872, "language_loss": 0.84875655, "learning_rate": 0.0009350746836689389, "loss": 0.86032403, "num_input_tokens_seen": 81195952, "router_z_loss_mlp": 0.14794922, "routerloss_mlp": 0.0, "step": 983, "time_per_iteration": 2.5128703117370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01232965, "balance_loss_mlp": 1.22199774, "diversity_loss_mlp": 0.0, "epoch": 0.1893035782993459, "flos": 1481974299648.0, "grad_norm": 0.06420942239022731, "language_loss": 0.81439221, "learning_rate": 0.0009349210748908193, "loss": 0.82672185, "num_input_tokens_seen": 81427312, "router_z_loss_mlp": 0.10986328, "routerloss_mlp": 0.0, "step": 984, "time_per_iteration": 4.987680196762085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144146, "balance_loss_mlp": 1.12880325, "diversity_loss_mlp": 0.0, "epoch": 0.18949595998460947, "flos": 508467373056.0, "grad_norm": 0.08702988523082197, "language_loss": 0.82654107, "learning_rate": 0.0009347672972613634, "loss": 0.83798254, "num_input_tokens_seen": 81494256, "router_z_loss_mlp": 0.15319824, "routerloss_mlp": 0.0, "step": 985, "time_per_iteration": 2.586580514907837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00891878, "balance_loss_mlp": 1.54986262, "diversity_loss_mlp": 0.20135348, "epoch": 0.18968834166987303, "flos": 531087611904.0, "grad_norm": 0.032521151954013804, "language_loss": 0.85226321, "learning_rate": 0.0009346133508402735, "loss": 0.86118197, "num_input_tokens_seen": 81569312, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01626948, "step": 986, "time_per_iteration": 2.7389352321624756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151414, "balance_loss_mlp": 1.13596404, "diversity_loss_mlp": 0.0, "epoch": 0.1898807233551366, "flos": 499762649088.0, "grad_norm": 0.0982536864932062, "language_loss": 0.84267235, "learning_rate": 0.0009344592356873166, "loss": 0.85418648, "num_input_tokens_seen": 81637024, "router_z_loss_mlp": 0.15429688, "routerloss_mlp": 0.0, "step": 987, "time_per_iteration": 2.6327145099639893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157169, "balance_loss_mlp": 1.14155281, "diversity_loss_mlp": 0.0, "epoch": 0.19007310504040015, "flos": 602220178944.0, "grad_norm": 0.07528447862042392, "language_loss": 0.78532755, "learning_rate": 0.0009343049518623255, "loss": 0.79689926, "num_input_tokens_seen": 81709488, "router_z_loss_mlp": 0.15600586, "routerloss_mlp": 0.0, "step": 988, "time_per_iteration": 2.7461259365081787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161817, "balance_loss_mlp": 1.14693928, "diversity_loss_mlp": 0.0, "epoch": 0.1902654867256637, "flos": 601651929600.0, "grad_norm": 0.07061488940634471, "language_loss": 0.83142781, "learning_rate": 0.0009341504994251985, "loss": 0.84304595, "num_input_tokens_seen": 81787152, "router_z_loss_mlp": 0.14855957, "routerloss_mlp": 0.0, "step": 989, "time_per_iteration": 2.9033045768737793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128003, "balance_loss_mlp": 1.11765516, "diversity_loss_mlp": 0.0, "epoch": 0.19045786841092727, "flos": 1575784005120.0, "grad_norm": 0.02664126889468688, "language_loss": 0.73520499, "learning_rate": 0.0009339958784358994, "loss": 0.74648499, "num_input_tokens_seen": 82030608, "router_z_loss_mlp": 0.10351562, "routerloss_mlp": 0.0, "step": 990, "time_per_iteration": 5.065544605255127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116372, "balance_loss_mlp": 1.14821064, "diversity_loss_mlp": 0.0, "epoch": 0.19065025009619085, "flos": 681634579968.0, "grad_norm": 0.062492069067547173, "language_loss": 0.81668103, "learning_rate": 0.0009338410889544574, "loss": 0.82831824, "num_input_tokens_seen": 82119872, "router_z_loss_mlp": 0.15490723, "routerloss_mlp": 0.0, "step": 991, "time_per_iteration": 3.0360453128814697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160077, "balance_loss_mlp": 1.14444828, "diversity_loss_mlp": 0.0, "epoch": 0.1908426317814544, "flos": 602264595456.0, "grad_norm": 0.07188646642614673, "language_loss": 0.87598348, "learning_rate": 0.000933686131040967, "loss": 0.88758421, "num_input_tokens_seen": 82195552, "router_z_loss_mlp": 0.15612793, "routerloss_mlp": 0.0, "step": 992, "time_per_iteration": 4.194309234619141 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132508, "balance_loss_mlp": 1.11693931, "diversity_loss_mlp": 0.0, "epoch": 0.19103501346671797, "flos": 586308077568.0, "grad_norm": 0.07096950165415856, "language_loss": 0.90250611, "learning_rate": 0.0009335310047555883, "loss": 0.91383117, "num_input_tokens_seen": 82267040, "router_z_loss_mlp": 0.15551758, "routerloss_mlp": 0.0, "step": 993, "time_per_iteration": 2.7198565006256104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128004, "balance_loss_mlp": 1.11225605, "diversity_loss_mlp": 0.0, "epoch": 0.19122739515198153, "flos": 545761898496.0, "grad_norm": 0.07682750770192658, "language_loss": 0.8836562, "learning_rate": 0.0009333757101585467, "loss": 0.89493626, "num_input_tokens_seen": 82337680, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 994, "time_per_iteration": 2.6651480197906494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121887, "balance_loss_mlp": 1.10621142, "diversity_loss_mlp": 0.0, "epoch": 0.1914197768372451, "flos": 521446450176.0, "grad_norm": 0.10461680978710068, "language_loss": 0.9317944, "learning_rate": 0.0009332202473101329, "loss": 0.94301325, "num_input_tokens_seen": 82409600, "router_z_loss_mlp": 0.15673828, "routerloss_mlp": 0.0, "step": 995, "time_per_iteration": 2.667943239212036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00890685, "balance_loss_mlp": 1.54595685, "diversity_loss_mlp": 0.2013846, "epoch": 0.19161215852250865, "flos": 611246103552.0, "grad_norm": 0.03439253799161941, "language_loss": 0.8270663, "learning_rate": 0.0009330646162707028, "loss": 0.83597314, "num_input_tokens_seen": 82480288, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0170145, "step": 996, "time_per_iteration": 2.7859413623809814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130524, "balance_loss_mlp": 1.11483645, "diversity_loss_mlp": 0.0, "epoch": 0.1918045402077722, "flos": 846660916224.0, "grad_norm": 0.07379991060729872, "language_loss": 0.84002179, "learning_rate": 0.0009329088171006779, "loss": 0.85132706, "num_input_tokens_seen": 82568960, "router_z_loss_mlp": 0.15673828, "routerloss_mlp": 0.0, "step": 997, "time_per_iteration": 3.133023738861084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136353, "balance_loss_mlp": 1.12061739, "diversity_loss_mlp": 0.0, "epoch": 0.19199692189303577, "flos": 465937090560.0, "grad_norm": 0.09187105070084006, "language_loss": 0.85599297, "learning_rate": 0.0009327528498605446, "loss": 0.86735654, "num_input_tokens_seen": 82634128, "router_z_loss_mlp": 0.15722656, "routerloss_mlp": 0.0, "step": 998, "time_per_iteration": 2.5390877723693848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00888942, "balance_loss_mlp": 1.54108667, "diversity_loss_mlp": 0.20404731, "epoch": 0.19218930357829936, "flos": 531576940032.0, "grad_norm": 0.03685920036749298, "language_loss": 0.89166534, "learning_rate": 0.0009325967146108548, "loss": 0.90055484, "num_input_tokens_seen": 82707472, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01637482, "step": 999, "time_per_iteration": 2.7167420387268066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159789, "balance_loss_mlp": 1.14361215, "diversity_loss_mlp": 0.0, "epoch": 0.19238168526356292, "flos": 601624765440.0, "grad_norm": 0.08415694153473897, "language_loss": 0.87386107, "learning_rate": 0.0009324404114122258, "loss": 0.88545901, "num_input_tokens_seen": 82775232, "router_z_loss_mlp": 0.16174316, "routerloss_mlp": 0.0, "step": 1000, "time_per_iteration": 2.6833291053771973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164843, "balance_loss_mlp": 1.1492269, "diversity_loss_mlp": 0.0, "epoch": 0.19257406694882648, "flos": 571982155776.0, "grad_norm": 0.07516183221332183, "language_loss": 0.86446774, "learning_rate": 0.0009322839403253397, "loss": 0.87611622, "num_input_tokens_seen": 82850032, "router_z_loss_mlp": 0.15612793, "routerloss_mlp": 0.0, "step": 1001, "time_per_iteration": 4.16480565071106 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173642, "balance_loss_mlp": 1.15789402, "diversity_loss_mlp": 0.0, "epoch": 0.19276644863409004, "flos": 801813219840.0, "grad_norm": 0.07739515949456567, "language_loss": 0.84035075, "learning_rate": 0.0009321273014109439, "loss": 0.8520872, "num_input_tokens_seen": 82926080, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 1002, "time_per_iteration": 2.9390604496002197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183539, "balance_loss_mlp": 1.16795826, "diversity_loss_mlp": 0.0, "epoch": 0.1929588303193536, "flos": 563314507776.0, "grad_norm": 0.08102605487142737, "language_loss": 0.84643984, "learning_rate": 0.0009319704947298513, "loss": 0.85827518, "num_input_tokens_seen": 83005200, "router_z_loss_mlp": 0.15576172, "routerloss_mlp": 0.0, "step": 1003, "time_per_iteration": 2.923952579498291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116012, "balance_loss_mlp": 1.14496815, "diversity_loss_mlp": 0.0, "epoch": 0.19315121200461716, "flos": 626837004288.0, "grad_norm": 0.060771133612280225, "language_loss": 0.88448775, "learning_rate": 0.0009318135203429393, "loss": 0.89608896, "num_input_tokens_seen": 83077280, "router_z_loss_mlp": 0.15124512, "routerloss_mlp": 0.0, "step": 1004, "time_per_iteration": 2.7170984745025635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135222, "balance_loss_mlp": 1.11972475, "diversity_loss_mlp": 0.0, "epoch": 0.19334359368988072, "flos": 517451079168.0, "grad_norm": 0.07023398647530335, "language_loss": 0.87528408, "learning_rate": 0.0009316563783111511, "loss": 0.88663626, "num_input_tokens_seen": 83145456, "router_z_loss_mlp": 0.15490723, "routerloss_mlp": 0.0, "step": 1005, "time_per_iteration": 2.7271320819854736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011162, "balance_loss_mlp": 1.10061884, "diversity_loss_mlp": 0.0, "epoch": 0.19353597537514428, "flos": 694080285696.0, "grad_norm": 0.07388032809600253, "language_loss": 0.82009041, "learning_rate": 0.0009314990686954943, "loss": 0.83125246, "num_input_tokens_seen": 83225392, "router_z_loss_mlp": 0.15576172, "routerloss_mlp": 0.0, "step": 1006, "time_per_iteration": 2.9210305213928223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108745, "balance_loss_mlp": 1.09337938, "diversity_loss_mlp": 0.0, "epoch": 0.19372835706040784, "flos": 1210170585600.0, "grad_norm": 0.06330578200459082, "language_loss": 0.80805916, "learning_rate": 0.000931341591557042, "loss": 0.81914663, "num_input_tokens_seen": 83331296, "router_z_loss_mlp": 0.15344238, "routerloss_mlp": 0.0, "step": 1007, "time_per_iteration": 3.695157051086426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095993, "balance_loss_mlp": 1.08054364, "diversity_loss_mlp": 0.0, "epoch": 0.19392073874567142, "flos": 520631152128.0, "grad_norm": 0.07858263731415134, "language_loss": 0.87216473, "learning_rate": 0.0009311839469569325, "loss": 0.88312465, "num_input_tokens_seen": 83399952, "router_z_loss_mlp": 0.15441895, "routerloss_mlp": 0.0, "step": 1008, "time_per_iteration": 2.633854389190674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108854, "balance_loss_mlp": 1.07287586, "diversity_loss_mlp": 0.0, "epoch": 0.19411312043093498, "flos": 588816013824.0, "grad_norm": 0.14235975733457876, "language_loss": 0.87399781, "learning_rate": 0.0009310261349563687, "loss": 0.88488322, "num_input_tokens_seen": 83468384, "router_z_loss_mlp": 0.15649414, "routerloss_mlp": 0.0, "step": 1009, "time_per_iteration": 2.702073574066162 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00898627, "balance_loss_mlp": 1.56164169, "diversity_loss_mlp": 0.20371187, "epoch": 0.19430550211619854, "flos": 579382253568.0, "grad_norm": 0.03011805945399338, "language_loss": 0.85438645, "learning_rate": 0.0009308681556166186, "loss": 0.86337274, "num_input_tokens_seen": 83547952, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01594995, "step": 1010, "time_per_iteration": 2.8698601722717285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111744, "balance_loss_mlp": 1.0962348, "diversity_loss_mlp": 0.0, "epoch": 0.1944978838014621, "flos": 621126973440.0, "grad_norm": 0.08879322612819535, "language_loss": 0.87462533, "learning_rate": 0.0009307100089990152, "loss": 0.88574278, "num_input_tokens_seen": 83615712, "router_z_loss_mlp": 0.15490723, "routerloss_mlp": 0.0, "step": 1011, "time_per_iteration": 2.7149901390075684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140864, "balance_loss_mlp": 1.12543821, "diversity_loss_mlp": 0.0, "epoch": 0.19469026548672566, "flos": 598714136064.0, "grad_norm": 0.07383907155719892, "language_loss": 0.83837229, "learning_rate": 0.0009305516951649568, "loss": 0.84978092, "num_input_tokens_seen": 83687296, "router_z_loss_mlp": 0.15405273, "routerloss_mlp": 0.0, "step": 1012, "time_per_iteration": 2.702683448791504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161407, "balance_loss_mlp": 1.14599323, "diversity_loss_mlp": 0.0, "epoch": 0.19488264717198922, "flos": 552161318400.0, "grad_norm": 0.07624018834593461, "language_loss": 0.86570859, "learning_rate": 0.0009303932141759057, "loss": 0.87732267, "num_input_tokens_seen": 83763168, "router_z_loss_mlp": 0.15393066, "routerloss_mlp": 0.0, "step": 1013, "time_per_iteration": 2.7500197887420654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168747, "balance_loss_mlp": 1.15382242, "diversity_loss_mlp": 0.0, "epoch": 0.19507502885725278, "flos": 666135456768.0, "grad_norm": 0.08469076174706892, "language_loss": 0.83575755, "learning_rate": 0.0009302345660933902, "loss": 0.84744501, "num_input_tokens_seen": 83837312, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1014, "time_per_iteration": 2.8010780811309814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171185, "balance_loss_mlp": 1.15642715, "diversity_loss_mlp": 0.0, "epoch": 0.19526741054251634, "flos": 671081946624.0, "grad_norm": 0.08619273283705803, "language_loss": 0.85146868, "learning_rate": 0.0009300757509790026, "loss": 0.86318052, "num_input_tokens_seen": 83917120, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 1015, "time_per_iteration": 2.840315103530884 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150596, "balance_loss_mlp": 1.13570654, "diversity_loss_mlp": 0.0, "epoch": 0.19545979222777993, "flos": 447215675904.0, "grad_norm": 0.10655365126946059, "language_loss": 0.90244913, "learning_rate": 0.0009299167688944005, "loss": 0.91395509, "num_input_tokens_seen": 83982992, "router_z_loss_mlp": 0.14855957, "routerloss_mlp": 0.0, "step": 1016, "time_per_iteration": 2.502391815185547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130549, "balance_loss_mlp": 1.11540985, "diversity_loss_mlp": 0.0, "epoch": 0.1956521739130435, "flos": 569084009472.0, "grad_norm": 0.07757202619564983, "language_loss": 0.85754222, "learning_rate": 0.0009297576199013063, "loss": 0.86884773, "num_input_tokens_seen": 84057296, "router_z_loss_mlp": 0.15112305, "routerloss_mlp": 0.0, "step": 1017, "time_per_iteration": 2.7255496978759766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00657481, "balance_loss_mlp": 1.1064117, "diversity_loss_mlp": 0.17609364, "epoch": 0.19584455559830705, "flos": 1455749273088.0, "grad_norm": 0.0027779106975556575, "language_loss": 0.73002136, "learning_rate": 0.0009295983040615071, "loss": 0.73659611, "num_input_tokens_seen": 84292640, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01622855, "step": 1018, "time_per_iteration": 4.943171739578247 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01384914, "balance_loss_mlp": 1.37351775, "diversity_loss_mlp": 0.0, "epoch": 0.1960369372835706, "flos": 1591150252032.0, "grad_norm": 0.09054623740471555, "language_loss": 0.79426301, "learning_rate": 0.0009294388214368547, "loss": 0.80811214, "num_input_tokens_seen": 84524448, "router_z_loss_mlp": 0.11376953, "routerloss_mlp": 0.0, "step": 1019, "time_per_iteration": 5.518418788909912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125322, "balance_loss_mlp": 1.11074281, "diversity_loss_mlp": 0.0, "epoch": 0.19622931896883417, "flos": 616017125376.0, "grad_norm": 0.08202201534603108, "language_loss": 0.8648417, "learning_rate": 0.0009292791720892659, "loss": 0.87609494, "num_input_tokens_seen": 84600208, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1020, "time_per_iteration": 2.889078140258789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131715, "balance_loss_mlp": 1.11721921, "diversity_loss_mlp": 0.0, "epoch": 0.19642170065409773, "flos": 466201391616.0, "grad_norm": 0.07932574612707302, "language_loss": 0.88913518, "learning_rate": 0.0009291193560807218, "loss": 0.90045238, "num_input_tokens_seen": 84668032, "router_z_loss_mlp": 0.14477539, "routerloss_mlp": 0.0, "step": 1021, "time_per_iteration": 2.5933609008789062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136679, "balance_loss_mlp": 1.122159, "diversity_loss_mlp": 0.0, "epoch": 0.19661408233936128, "flos": 515289309696.0, "grad_norm": 0.08278255048112054, "language_loss": 0.87034905, "learning_rate": 0.0009289593734732688, "loss": 0.88171583, "num_input_tokens_seen": 84738176, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1022, "time_per_iteration": 2.600834369659424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132774, "balance_loss_mlp": 1.11842132, "diversity_loss_mlp": 0.0, "epoch": 0.19680646402462484, "flos": 392640182784.0, "grad_norm": 0.08270608551386573, "language_loss": 0.93774927, "learning_rate": 0.0009287992243290175, "loss": 0.94907701, "num_input_tokens_seen": 84799936, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1023, "time_per_iteration": 2.474914312362671 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111783, "balance_loss_mlp": 1.10275006, "diversity_loss_mlp": 0.0, "epoch": 0.19699884570988843, "flos": 626421828096.0, "grad_norm": 0.06901830196983176, "language_loss": 0.90473127, "learning_rate": 0.0009286389087101435, "loss": 0.91590953, "num_input_tokens_seen": 84877216, "router_z_loss_mlp": 0.15063477, "routerloss_mlp": 0.0, "step": 1024, "time_per_iteration": 2.7718465328216553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120328, "balance_loss_mlp": 1.1055932, "diversity_loss_mlp": 0.0, "epoch": 0.197191227395152, "flos": 557982577152.0, "grad_norm": 0.07476522676232629, "language_loss": 0.8853035, "learning_rate": 0.0009284784266788864, "loss": 0.89650679, "num_input_tokens_seen": 84952464, "router_z_loss_mlp": 0.14697266, "routerloss_mlp": 0.0, "step": 1025, "time_per_iteration": 2.7143290042877197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122071, "balance_loss_mlp": 1.10795665, "diversity_loss_mlp": 0.0, "epoch": 0.19738360908041555, "flos": 664993815552.0, "grad_norm": 0.08990804702262417, "language_loss": 0.91984832, "learning_rate": 0.0009283177782975512, "loss": 0.93106908, "num_input_tokens_seen": 85031488, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1026, "time_per_iteration": 2.948909282684326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115739, "balance_loss_mlp": 1.10118401, "diversity_loss_mlp": 0.0, "epoch": 0.1975759907656791, "flos": 522496687104.0, "grad_norm": 0.08229992096701991, "language_loss": 0.88074464, "learning_rate": 0.000928156963628507, "loss": 0.89190209, "num_input_tokens_seen": 85098384, "router_z_loss_mlp": 0.14526367, "routerloss_mlp": 0.0, "step": 1027, "time_per_iteration": 2.5764074325561523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109846, "balance_loss_mlp": 1.09483802, "diversity_loss_mlp": 0.0, "epoch": 0.19776837245094267, "flos": 462482804736.0, "grad_norm": 0.08379460495492784, "language_loss": 0.87978798, "learning_rate": 0.0009279959827341877, "loss": 0.89088643, "num_input_tokens_seen": 85172944, "router_z_loss_mlp": 0.14990234, "routerloss_mlp": 0.0, "step": 1028, "time_per_iteration": 2.752347946166992 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095635, "balance_loss_mlp": 1.08043635, "diversity_loss_mlp": 0.0, "epoch": 0.19796075413620623, "flos": 503058719232.0, "grad_norm": 0.08467225305095022, "language_loss": 0.87624389, "learning_rate": 0.0009278348356770915, "loss": 0.88720024, "num_input_tokens_seen": 85241632, "router_z_loss_mlp": 0.15185547, "routerloss_mlp": 0.0, "step": 1029, "time_per_iteration": 2.555527687072754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096968, "balance_loss_mlp": 1.08132768, "diversity_loss_mlp": 0.0, "epoch": 0.1981531358214698, "flos": 507538275840.0, "grad_norm": 0.0755245964113765, "language_loss": 0.85285002, "learning_rate": 0.0009276735225197814, "loss": 0.86381966, "num_input_tokens_seen": 85308992, "router_z_loss_mlp": 0.15625, "routerloss_mlp": 0.0, "step": 1030, "time_per_iteration": 2.5947089195251465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104122, "balance_loss_mlp": 1.08832633, "diversity_loss_mlp": 0.0, "epoch": 0.19834551750673335, "flos": 531547204608.0, "grad_norm": 0.08972056860523267, "language_loss": 0.85732102, "learning_rate": 0.0009275120433248847, "loss": 0.86836231, "num_input_tokens_seen": 85381936, "router_z_loss_mlp": 0.15783691, "routerloss_mlp": 0.0, "step": 1031, "time_per_iteration": 2.676872730255127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109193, "balance_loss_mlp": 1.09355247, "diversity_loss_mlp": 0.0, "epoch": 0.1985378991919969, "flos": 775511096832.0, "grad_norm": 0.07488561277584621, "language_loss": 0.85529125, "learning_rate": 0.0009273503981550931, "loss": 0.86638314, "num_input_tokens_seen": 85474352, "router_z_loss_mlp": 0.15625, "routerloss_mlp": 0.0, "step": 1032, "time_per_iteration": 3.09958815574646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099668, "balance_loss_mlp": 1.08494592, "diversity_loss_mlp": 0.0, "epoch": 0.1987302808772605, "flos": 434288355840.0, "grad_norm": 0.1040963884260124, "language_loss": 0.86882496, "learning_rate": 0.0009271885870731626, "loss": 0.87982166, "num_input_tokens_seen": 85538416, "router_z_loss_mlp": 0.14697266, "routerloss_mlp": 0.0, "step": 1033, "time_per_iteration": 2.509047269821167 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098009, "balance_loss_mlp": 1.08258307, "diversity_loss_mlp": 0.0, "epoch": 0.19892266256252406, "flos": 553604336640.0, "grad_norm": 0.09324111295027285, "language_loss": 0.88376671, "learning_rate": 0.0009270266101419143, "loss": 0.89474678, "num_input_tokens_seen": 85604416, "router_z_loss_mlp": 0.1541748, "routerloss_mlp": 0.0, "step": 1034, "time_per_iteration": 2.6504034996032715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094236, "balance_loss_mlp": 1.07954955, "diversity_loss_mlp": 0.0, "epoch": 0.19911504424778761, "flos": 549865926144.0, "grad_norm": 0.12545708784893086, "language_loss": 0.85201651, "learning_rate": 0.0009268644674242328, "loss": 0.86295891, "num_input_tokens_seen": 85677008, "router_z_loss_mlp": 0.14672852, "routerloss_mlp": 0.0, "step": 1035, "time_per_iteration": 2.6919047832489014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105423, "balance_loss_mlp": 1.08997381, "diversity_loss_mlp": 0.0, "epoch": 0.19930742593305117, "flos": 518281431552.0, "grad_norm": 0.09055239952020887, "language_loss": 0.80814689, "learning_rate": 0.0009267021589830678, "loss": 0.81920111, "num_input_tokens_seen": 85745200, "router_z_loss_mlp": 0.15429688, "routerloss_mlp": 0.0, "step": 1036, "time_per_iteration": 2.582871198654175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01278291, "balance_loss_mlp": 1.26927888, "diversity_loss_mlp": 0.0, "epoch": 0.19949980761831473, "flos": 1509338769408.0, "grad_norm": 0.10087907784966592, "language_loss": 0.77627081, "learning_rate": 0.0009265396848814328, "loss": 0.78905374, "num_input_tokens_seen": 85980608, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 1037, "time_per_iteration": 4.955699920654297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112922, "balance_loss_mlp": 1.11371088, "diversity_loss_mlp": 0.0, "epoch": 0.1996921893035783, "flos": 698129985024.0, "grad_norm": 0.08737337363848705, "language_loss": 0.9264009, "learning_rate": 0.000926377045182406, "loss": 0.93769312, "num_input_tokens_seen": 86055952, "router_z_loss_mlp": 0.15490723, "routerloss_mlp": 0.0, "step": 1038, "time_per_iteration": 2.8884389400482178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140143, "balance_loss_mlp": 1.12453878, "diversity_loss_mlp": 0.0, "epoch": 0.19988457098884185, "flos": 727023734784.0, "grad_norm": 0.10415849564176528, "language_loss": 0.87916917, "learning_rate": 0.0009262142399491296, "loss": 0.89057058, "num_input_tokens_seen": 86145536, "router_z_loss_mlp": 0.15588379, "routerloss_mlp": 0.0, "step": 1039, "time_per_iteration": 3.045872211456299 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143419, "balance_loss_mlp": 1.12763548, "diversity_loss_mlp": 0.0, "epoch": 0.2000769526741054, "flos": 560544841728.0, "grad_norm": 0.09906225236156592, "language_loss": 0.87455821, "learning_rate": 0.0009260512692448105, "loss": 0.88599241, "num_input_tokens_seen": 86214480, "router_z_loss_mlp": 0.15771484, "routerloss_mlp": 0.0, "step": 1040, "time_per_iteration": 2.699052572250366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124613, "balance_loss_mlp": 1.10879421, "diversity_loss_mlp": 0.0, "epoch": 0.200269334359369, "flos": 572039055360.0, "grad_norm": 0.0911420547130344, "language_loss": 0.8431657, "learning_rate": 0.000925888133132719, "loss": 0.85441184, "num_input_tokens_seen": 86289824, "router_z_loss_mlp": 0.15808105, "routerloss_mlp": 0.0, "step": 1041, "time_per_iteration": 2.780141830444336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063145, "balance_loss_mlp": 1.05260694, "diversity_loss_mlp": 0.0, "epoch": 0.20046171604463256, "flos": 1486118347776.0, "grad_norm": 0.04139604987307943, "language_loss": 0.79610431, "learning_rate": 0.0009257248316761906, "loss": 0.80673575, "num_input_tokens_seen": 86516384, "router_z_loss_mlp": 0.10546875, "routerloss_mlp": 0.0, "step": 1042, "time_per_iteration": 4.971017360687256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100746, "balance_loss_mlp": 1.08498645, "diversity_loss_mlp": 0.0, "epoch": 0.20065409772989612, "flos": 496528247808.0, "grad_norm": 0.08950731646766712, "language_loss": 0.81070006, "learning_rate": 0.0009255613649386244, "loss": 0.82170749, "num_input_tokens_seen": 86587296, "router_z_loss_mlp": 0.1574707, "routerloss_mlp": 0.0, "step": 1043, "time_per_iteration": 2.6508612632751465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091355, "balance_loss_mlp": 1.07623935, "diversity_loss_mlp": 0.0, "epoch": 0.20084647941515968, "flos": 579367572480.0, "grad_norm": 0.07614483401418765, "language_loss": 0.78829026, "learning_rate": 0.0009253977329834838, "loss": 0.79920387, "num_input_tokens_seen": 86662656, "router_z_loss_mlp": 0.15100098, "routerloss_mlp": 0.0, "step": 1044, "time_per_iteration": 2.7090582847595215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109464, "balance_loss_mlp": 1.07947624, "diversity_loss_mlp": 0.0, "epoch": 0.20103886110042324, "flos": 642076968960.0, "grad_norm": 0.0989854096864982, "language_loss": 0.86366481, "learning_rate": 0.0009252339358742965, "loss": 0.8746112, "num_input_tokens_seen": 86734704, "router_z_loss_mlp": 0.15148926, "routerloss_mlp": 0.0, "step": 1045, "time_per_iteration": 2.801323652267456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100349, "balance_loss_mlp": 1.08526874, "diversity_loss_mlp": 0.0, "epoch": 0.2012312427856868, "flos": 441970007040.0, "grad_norm": 0.07994799859902735, "language_loss": 0.83704323, "learning_rate": 0.000925069973674654, "loss": 0.84804672, "num_input_tokens_seen": 86806512, "router_z_loss_mlp": 0.15063477, "routerloss_mlp": 0.0, "step": 1046, "time_per_iteration": 2.6286635398864746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011046, "balance_loss_mlp": 1.09036636, "diversity_loss_mlp": 0.0, "epoch": 0.20142362447095036, "flos": 554402382336.0, "grad_norm": 0.05803081938267982, "language_loss": 0.88841283, "learning_rate": 0.000924905846448212, "loss": 0.89945889, "num_input_tokens_seen": 86883440, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1047, "time_per_iteration": 2.7208023071289062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135029, "balance_loss_mlp": 1.12078381, "diversity_loss_mlp": 0.0, "epoch": 0.20161600615621392, "flos": 670301153280.0, "grad_norm": 0.09159511175118457, "language_loss": 0.85692465, "learning_rate": 0.0009247415542586906, "loss": 0.86827493, "num_input_tokens_seen": 86960208, "router_z_loss_mlp": 0.14257812, "routerloss_mlp": 0.0, "step": 1048, "time_per_iteration": 2.8772377967834473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0089504, "balance_loss_mlp": 1.55797935, "diversity_loss_mlp": 0.19993141, "epoch": 0.2018083878414775, "flos": 573091490304.0, "grad_norm": 0.028193920194447036, "language_loss": 0.83094788, "learning_rate": 0.0009245770971698735, "loss": 0.83989829, "num_input_tokens_seen": 87044144, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01608507, "step": 1049, "time_per_iteration": 2.922792911529541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143099, "balance_loss_mlp": 1.12878203, "diversity_loss_mlp": 0.0, "epoch": 0.20200076952674106, "flos": 425857844736.0, "grad_norm": 0.08345797467079887, "language_loss": 0.88434327, "learning_rate": 0.0009244124752456087, "loss": 0.89577425, "num_input_tokens_seen": 87109136, "router_z_loss_mlp": 0.14306641, "routerloss_mlp": 0.0, "step": 1050, "time_per_iteration": 2.5263967514038086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141188, "balance_loss_mlp": 1.12675214, "diversity_loss_mlp": 0.0, "epoch": 0.20219315121200462, "flos": 536597581824.0, "grad_norm": 0.07479960387863874, "language_loss": 0.85303241, "learning_rate": 0.0009242476885498081, "loss": 0.86444432, "num_input_tokens_seen": 87184320, "router_z_loss_mlp": 0.14416504, "routerloss_mlp": 0.0, "step": 1051, "time_per_iteration": 2.8012773990631104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146474, "balance_loss_mlp": 1.13181126, "diversity_loss_mlp": 0.0, "epoch": 0.20238553289726818, "flos": 477873644544.0, "grad_norm": 0.07632391919964465, "language_loss": 0.81114984, "learning_rate": 0.0009240827371464474, "loss": 0.82261455, "num_input_tokens_seen": 87248224, "router_z_loss_mlp": 0.14672852, "routerloss_mlp": 0.0, "step": 1052, "time_per_iteration": 2.546449661254883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146729, "balance_loss_mlp": 1.1323998, "diversity_loss_mlp": 0.0, "epoch": 0.20257791458253174, "flos": 1152057116160.0, "grad_norm": 0.11219768477147798, "language_loss": 0.84167284, "learning_rate": 0.0009239176210995666, "loss": 0.85314012, "num_input_tokens_seen": 87333088, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1053, "time_per_iteration": 3.4905290603637695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153158, "balance_loss_mlp": 1.13878179, "diversity_loss_mlp": 0.0, "epoch": 0.2027702962677953, "flos": 666913678848.0, "grad_norm": 0.07345468089138417, "language_loss": 0.93850195, "learning_rate": 0.0009237523404732695, "loss": 0.95003355, "num_input_tokens_seen": 87413840, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1054, "time_per_iteration": 2.8854215145111084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116831, "balance_loss_mlp": 1.15374279, "diversity_loss_mlp": 0.0, "epoch": 0.20296267795305886, "flos": 641298746880.0, "grad_norm": 0.08788286689344726, "language_loss": 0.84136868, "learning_rate": 0.0009235868953317235, "loss": 0.85305184, "num_input_tokens_seen": 87487168, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1055, "time_per_iteration": 2.785616397857666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115453, "balance_loss_mlp": 1.14033246, "diversity_loss_mlp": 0.0, "epoch": 0.20315505963832242, "flos": 930575070720.0, "grad_norm": 0.07006303181868268, "language_loss": 0.85314858, "learning_rate": 0.0009234212857391602, "loss": 0.86469388, "num_input_tokens_seen": 87573184, "router_z_loss_mlp": 0.14208984, "routerloss_mlp": 0.0, "step": 1056, "time_per_iteration": 3.192293167114258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167757, "balance_loss_mlp": 1.15304708, "diversity_loss_mlp": 0.0, "epoch": 0.20334744132358598, "flos": 562111197696.0, "grad_norm": 0.07469852363602907, "language_loss": 0.89220309, "learning_rate": 0.000923255511759875, "loss": 0.9038806, "num_input_tokens_seen": 87651968, "router_z_loss_mlp": 0.14697266, "routerloss_mlp": 0.0, "step": 1057, "time_per_iteration": 2.783778429031372 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00881428, "balance_loss_mlp": 1.53356147, "diversity_loss_mlp": 0.1968638, "epoch": 0.20353982300884957, "flos": 644206804992.0, "grad_norm": 0.032510948660132113, "language_loss": 0.84587663, "learning_rate": 0.000923089573458227, "loss": 0.85469091, "num_input_tokens_seen": 87727792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01621579, "step": 1058, "time_per_iteration": 2.8847100734710693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150746, "balance_loss_mlp": 1.13623881, "diversity_loss_mlp": 0.0, "epoch": 0.20373220469411313, "flos": 651421522944.0, "grad_norm": 0.11181454207252314, "language_loss": 0.83516467, "learning_rate": 0.0009229234708986392, "loss": 0.84667218, "num_input_tokens_seen": 87806048, "router_z_loss_mlp": 0.14477539, "routerloss_mlp": 0.0, "step": 1059, "time_per_iteration": 2.9079415798187256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172867, "balance_loss_mlp": 1.16251993, "diversity_loss_mlp": 0.0, "epoch": 0.2039245863793767, "flos": 1437628787712.0, "grad_norm": 0.06024273804144221, "language_loss": 0.81666899, "learning_rate": 0.0009227572041455982, "loss": 0.82839763, "num_input_tokens_seen": 88018160, "router_z_loss_mlp": 0.10351562, "routerloss_mlp": 0.0, "step": 1060, "time_per_iteration": 4.646218776702881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112152, "balance_loss_mlp": 1.10713172, "diversity_loss_mlp": 0.0, "epoch": 0.20411696806464025, "flos": 596967169536.0, "grad_norm": 0.08928557521337042, "language_loss": 0.85345757, "learning_rate": 0.0009225907732636548, "loss": 0.86467278, "num_input_tokens_seen": 88090864, "router_z_loss_mlp": 0.1439209, "routerloss_mlp": 0.0, "step": 1061, "time_per_iteration": 2.745448112487793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106249, "balance_loss_mlp": 1.09209883, "diversity_loss_mlp": 0.0, "epoch": 0.2043093497499038, "flos": 573803274240.0, "grad_norm": 0.079028173596336, "language_loss": 0.86936563, "learning_rate": 0.0009224241783174227, "loss": 0.88042819, "num_input_tokens_seen": 88161360, "router_z_loss_mlp": 0.14172363, "routerloss_mlp": 0.0, "step": 1062, "time_per_iteration": 2.6923935413360596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090506, "balance_loss_mlp": 1.07616472, "diversity_loss_mlp": 0.0, "epoch": 0.20450173143516737, "flos": 630352958976.0, "grad_norm": 0.07452632641130948, "language_loss": 0.85384166, "learning_rate": 0.0009222574193715802, "loss": 0.86474669, "num_input_tokens_seen": 88234960, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1063, "time_per_iteration": 2.7701327800750732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092958, "balance_loss_mlp": 1.07850981, "diversity_loss_mlp": 0.0, "epoch": 0.20469411312043093, "flos": 574003335168.0, "grad_norm": 0.06517233034985846, "language_loss": 0.85915947, "learning_rate": 0.000922090496490869, "loss": 0.87008905, "num_input_tokens_seen": 88308176, "router_z_loss_mlp": 0.14440918, "routerloss_mlp": 0.0, "step": 1064, "time_per_iteration": 2.7387099266052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098934, "balance_loss_mlp": 1.08404493, "diversity_loss_mlp": 0.0, "epoch": 0.20488649480569449, "flos": 637053755904.0, "grad_norm": 0.06963355430403552, "language_loss": 0.89889115, "learning_rate": 0.0009219234097400937, "loss": 0.90988052, "num_input_tokens_seen": 88386768, "router_z_loss_mlp": 0.14868164, "routerloss_mlp": 0.0, "step": 1065, "time_per_iteration": 2.859334707260132 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112884, "balance_loss_mlp": 1.09778059, "diversity_loss_mlp": 0.0, "epoch": 0.20507887649095807, "flos": 975793526784.0, "grad_norm": 0.06723697540994414, "language_loss": 0.83086514, "learning_rate": 0.0009217561591841237, "loss": 0.84199405, "num_input_tokens_seen": 88476576, "router_z_loss_mlp": 0.15075684, "routerloss_mlp": 0.0, "step": 1066, "time_per_iteration": 3.3065547943115234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00886484, "balance_loss_mlp": 1.54046464, "diversity_loss_mlp": 0.1982768, "epoch": 0.20527125817622163, "flos": 486183015936.0, "grad_norm": 0.03984406199709606, "language_loss": 0.80820358, "learning_rate": 0.0009215887448878913, "loss": 0.8170684, "num_input_tokens_seen": 88541968, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01711285, "step": 1067, "time_per_iteration": 2.6291754245758057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131678, "balance_loss_mlp": 1.11697936, "diversity_loss_mlp": 0.0, "epoch": 0.2054636398614852, "flos": 527178875904.0, "grad_norm": 0.07633348035576148, "language_loss": 0.85365784, "learning_rate": 0.0009214211669163922, "loss": 0.86497462, "num_input_tokens_seen": 88615296, "router_z_loss_mlp": 0.14685059, "routerloss_mlp": 0.0, "step": 1068, "time_per_iteration": 2.747936725616455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136914, "balance_loss_mlp": 1.12220347, "diversity_loss_mlp": 0.0, "epoch": 0.20565602154674875, "flos": 558182638080.0, "grad_norm": 0.07197705825645119, "language_loss": 0.9405331, "learning_rate": 0.0009212534253346862, "loss": 0.95190227, "num_input_tokens_seen": 88691584, "router_z_loss_mlp": 0.14709473, "routerloss_mlp": 0.0, "step": 1069, "time_per_iteration": 2.696131467819214 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128507, "balance_loss_mlp": 1.11372542, "diversity_loss_mlp": 0.0, "epoch": 0.2058484032320123, "flos": 504224953344.0, "grad_norm": 0.09743186487320747, "language_loss": 0.84269625, "learning_rate": 0.0009210855202078964, "loss": 0.85398132, "num_input_tokens_seen": 88756592, "router_z_loss_mlp": 0.14770508, "routerloss_mlp": 0.0, "step": 1070, "time_per_iteration": 2.6194372177124023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114316, "balance_loss_mlp": 1.12903321, "diversity_loss_mlp": 0.0, "epoch": 0.20604078491727587, "flos": 433169109504.0, "grad_norm": 0.08033414700046611, "language_loss": 0.87081122, "learning_rate": 0.0009209174516012091, "loss": 0.88224292, "num_input_tokens_seen": 88820928, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1071, "time_per_iteration": 2.5169904232025146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146914, "balance_loss_mlp": 1.13247752, "diversity_loss_mlp": 0.0, "epoch": 0.20623316660253943, "flos": 608711003136.0, "grad_norm": 0.06769648970134874, "language_loss": 0.89207751, "learning_rate": 0.0009207492195798747, "loss": 0.90354669, "num_input_tokens_seen": 88895440, "router_z_loss_mlp": 0.14428711, "routerloss_mlp": 0.0, "step": 1072, "time_per_iteration": 2.804577112197876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137485, "balance_loss_mlp": 1.12303698, "diversity_loss_mlp": 0.0, "epoch": 0.206425548287803, "flos": 480425997312.0, "grad_norm": 0.0857236005827703, "language_loss": 0.84780991, "learning_rate": 0.0009205808242092061, "loss": 0.85918474, "num_input_tokens_seen": 88964400, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1073, "time_per_iteration": 2.6134936809539795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122455, "balance_loss_mlp": 1.10787559, "diversity_loss_mlp": 0.0, "epoch": 0.20661792997306658, "flos": 949429734912.0, "grad_norm": 0.09531084522047072, "language_loss": 0.82512677, "learning_rate": 0.0009204122655545808, "loss": 0.83635134, "num_input_tokens_seen": 89049600, "router_z_loss_mlp": 0.14575195, "routerloss_mlp": 0.0, "step": 1074, "time_per_iteration": 3.461315155029297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00888955, "balance_loss_mlp": 1.54418314, "diversity_loss_mlp": 0.20175909, "epoch": 0.20681031165833014, "flos": 603487729152.0, "grad_norm": 0.03221822204199988, "language_loss": 0.80952764, "learning_rate": 0.0009202435436814388, "loss": 0.81841719, "num_input_tokens_seen": 89119024, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01598355, "step": 1075, "time_per_iteration": 2.728055238723755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146745, "balance_loss_mlp": 1.13259482, "diversity_loss_mlp": 0.0, "epoch": 0.2070026933435937, "flos": 708984368640.0, "grad_norm": 0.0831097658087499, "language_loss": 0.89925295, "learning_rate": 0.0009200746586552836, "loss": 0.91072041, "num_input_tokens_seen": 89197344, "router_z_loss_mlp": 0.14147949, "routerloss_mlp": 0.0, "step": 1076, "time_per_iteration": 2.929422616958618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136182, "balance_loss_mlp": 1.12185347, "diversity_loss_mlp": 0.0, "epoch": 0.20719507502885726, "flos": 829814948352.0, "grad_norm": 0.07960863169785164, "language_loss": 0.84148425, "learning_rate": 0.0009199056105416825, "loss": 0.85284609, "num_input_tokens_seen": 89280464, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1077, "time_per_iteration": 3.0795576572418213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148051, "balance_loss_mlp": 1.13384151, "diversity_loss_mlp": 0.0, "epoch": 0.20738745671412082, "flos": 638294141952.0, "grad_norm": 0.06589509494701294, "language_loss": 0.86599898, "learning_rate": 0.0009197363994062654, "loss": 0.87747955, "num_input_tokens_seen": 89353344, "router_z_loss_mlp": 0.14208984, "routerloss_mlp": 0.0, "step": 1078, "time_per_iteration": 2.8304550647735596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00891417, "balance_loss_mlp": 1.54815006, "diversity_loss_mlp": 0.20151556, "epoch": 0.20757983839938438, "flos": 685602786816.0, "grad_norm": 0.027729032115243194, "language_loss": 0.84302026, "learning_rate": 0.0009195670253147262, "loss": 0.85193443, "num_input_tokens_seen": 89439328, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01658459, "step": 1079, "time_per_iteration": 2.987715005874634 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168872, "balance_loss_mlp": 1.15472198, "diversity_loss_mlp": 0.0, "epoch": 0.20777222008464794, "flos": 519282109440.0, "grad_norm": 0.07878432741989363, "language_loss": 0.82508785, "learning_rate": 0.0009193974883328216, "loss": 0.83677661, "num_input_tokens_seen": 89510160, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1080, "time_per_iteration": 2.6007754802703857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178335, "balance_loss_mlp": 1.16408908, "diversity_loss_mlp": 0.0, "epoch": 0.2079646017699115, "flos": 511402595328.0, "grad_norm": 0.06872318796781544, "language_loss": 0.86871535, "learning_rate": 0.0009192277885263718, "loss": 0.88049871, "num_input_tokens_seen": 89582960, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1081, "time_per_iteration": 2.645918846130371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116777, "balance_loss_mlp": 1.15339386, "diversity_loss_mlp": 0.0, "epoch": 0.20815698345517505, "flos": 931820226048.0, "grad_norm": 0.08475435362049728, "language_loss": 0.86010319, "learning_rate": 0.0009190579259612602, "loss": 0.87178093, "num_input_tokens_seen": 89675488, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1082, "time_per_iteration": 3.2688331604003906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153529, "balance_loss_mlp": 1.13914001, "diversity_loss_mlp": 0.0, "epoch": 0.20834936514043864, "flos": 632401302528.0, "grad_norm": 0.06676527060715894, "language_loss": 0.86419082, "learning_rate": 0.000918887900703433, "loss": 0.8757261, "num_input_tokens_seen": 89747872, "router_z_loss_mlp": 0.14379883, "routerloss_mlp": 0.0, "step": 1083, "time_per_iteration": 2.7645068168640137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129996, "balance_loss_mlp": 1.11559522, "diversity_loss_mlp": 0.0, "epoch": 0.2085417468257022, "flos": 394384578048.0, "grad_norm": 0.07296749014166971, "language_loss": 0.89779425, "learning_rate": 0.0009187177128188999, "loss": 0.90909421, "num_input_tokens_seen": 89810176, "router_z_loss_mlp": 0.14404297, "routerloss_mlp": 0.0, "step": 1084, "time_per_iteration": 2.441312313079834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128832, "balance_loss_mlp": 1.11915255, "diversity_loss_mlp": 0.0, "epoch": 0.20873412851096576, "flos": 1402147293696.0, "grad_norm": 0.053207927956046876, "language_loss": 0.77156538, "learning_rate": 0.0009185473623737339, "loss": 0.78285372, "num_input_tokens_seen": 90038432, "router_z_loss_mlp": 0.09667969, "routerloss_mlp": 0.0, "step": 1085, "time_per_iteration": 4.864179849624634 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117368, "balance_loss_mlp": 1.1029439, "diversity_loss_mlp": 0.0, "epoch": 0.20892651019622932, "flos": 447830913024.0, "grad_norm": 0.07905606819783856, "language_loss": 0.85833263, "learning_rate": 0.000918376849434071, "loss": 0.86950636, "num_input_tokens_seen": 90101568, "router_z_loss_mlp": 0.14428711, "routerloss_mlp": 0.0, "step": 1086, "time_per_iteration": 4.049270868301392 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112999, "balance_loss_mlp": 1.09849179, "diversity_loss_mlp": 0.0, "epoch": 0.20911889188149288, "flos": 493106268672.0, "grad_norm": 0.08954509639668791, "language_loss": 0.90778226, "learning_rate": 0.0009182061740661098, "loss": 0.91891223, "num_input_tokens_seen": 90169344, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1087, "time_per_iteration": 2.557358741760254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128974, "balance_loss_mlp": 1.11446643, "diversity_loss_mlp": 0.0, "epoch": 0.20931127356675644, "flos": 841291909632.0, "grad_norm": 0.08446380837501397, "language_loss": 0.85054636, "learning_rate": 0.0009180353363361127, "loss": 0.86183608, "num_input_tokens_seen": 90252416, "router_z_loss_mlp": 0.14477539, "routerloss_mlp": 0.0, "step": 1088, "time_per_iteration": 3.0897305011749268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118573, "balance_loss_mlp": 1.10417306, "diversity_loss_mlp": 0.0, "epoch": 0.20950365525202, "flos": 757140618240.0, "grad_norm": 0.08173869768976531, "language_loss": 0.82508695, "learning_rate": 0.0009178643363104044, "loss": 0.83627272, "num_input_tokens_seen": 90337952, "router_z_loss_mlp": 0.14379883, "routerloss_mlp": 0.0, "step": 1089, "time_per_iteration": 3.124645948410034 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113657, "balance_loss_mlp": 1.09938824, "diversity_loss_mlp": 0.0, "epoch": 0.20969603693728356, "flos": 472539142656.0, "grad_norm": 0.09307233053408402, "language_loss": 0.90518665, "learning_rate": 0.0009176931740553735, "loss": 0.9163233, "num_input_tokens_seen": 90401488, "router_z_loss_mlp": 0.14282227, "routerloss_mlp": 0.0, "step": 1090, "time_per_iteration": 2.6098225116729736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113731, "balance_loss_mlp": 1.09981966, "diversity_loss_mlp": 0.0, "epoch": 0.20988841862254715, "flos": 976930025472.0, "grad_norm": 0.09489388322063774, "language_loss": 0.8240813, "learning_rate": 0.0009175218496374708, "loss": 0.83521861, "num_input_tokens_seen": 90486144, "router_z_loss_mlp": 0.13916016, "routerloss_mlp": 0.0, "step": 1091, "time_per_iteration": 3.336355686187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110612, "balance_loss_mlp": 1.09205294, "diversity_loss_mlp": 0.0, "epoch": 0.2100808003078107, "flos": 1093120634880.0, "grad_norm": 0.08870561470384966, "language_loss": 0.86057436, "learning_rate": 0.0009173503631232103, "loss": 0.87163556, "num_input_tokens_seen": 90571504, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1092, "time_per_iteration": 3.356015682220459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106884, "balance_loss_mlp": 1.09269798, "diversity_loss_mlp": 0.0, "epoch": 0.21027318199307427, "flos": 1012964714496.0, "grad_norm": 0.09478788106803046, "language_loss": 0.82067865, "learning_rate": 0.0009171787145791691, "loss": 0.83174753, "num_input_tokens_seen": 90646016, "router_z_loss_mlp": 0.14196777, "routerloss_mlp": 0.0, "step": 1093, "time_per_iteration": 3.2546143531799316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116222, "balance_loss_mlp": 1.10199988, "diversity_loss_mlp": 0.0, "epoch": 0.21046556367833782, "flos": 521394693120.0, "grad_norm": 0.14674509624116924, "language_loss": 0.80160701, "learning_rate": 0.000917006904071987, "loss": 0.81276917, "num_input_tokens_seen": 90713440, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1094, "time_per_iteration": 2.5837080478668213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00911953, "balance_loss_mlp": 1.58726883, "diversity_loss_mlp": 0.20477253, "epoch": 0.21065794536360138, "flos": 603717525504.0, "grad_norm": 0.035943125208157026, "language_loss": 0.8737694, "learning_rate": 0.0009168349316683669, "loss": 0.88288891, "num_input_tokens_seen": 90788208, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01593196, "step": 1095, "time_per_iteration": 2.768296718597412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136825, "balance_loss_mlp": 1.1224122, "diversity_loss_mlp": 0.0, "epoch": 0.21085032704886494, "flos": 603346765824.0, "grad_norm": 0.06639171103878667, "language_loss": 0.82719827, "learning_rate": 0.0009166627974350741, "loss": 0.83856648, "num_input_tokens_seen": 90873776, "router_z_loss_mlp": 0.14416504, "routerloss_mlp": 0.0, "step": 1096, "time_per_iteration": 2.8819992542266846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145046, "balance_loss_mlp": 1.13041949, "diversity_loss_mlp": 0.0, "epoch": 0.2110427087341285, "flos": 637671564288.0, "grad_norm": 0.08337696606413014, "language_loss": 0.89929205, "learning_rate": 0.0009164905014389373, "loss": 0.91074252, "num_input_tokens_seen": 90945872, "router_z_loss_mlp": 0.14624023, "routerloss_mlp": 0.0, "step": 1097, "time_per_iteration": 2.7877442836761475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163813, "balance_loss_mlp": 1.1495918, "diversity_loss_mlp": 0.0, "epoch": 0.21123509041939206, "flos": 522919203840.0, "grad_norm": 0.08033808486911229, "language_loss": 0.86386079, "learning_rate": 0.0009163180437468476, "loss": 0.87549889, "num_input_tokens_seen": 91016224, "router_z_loss_mlp": 0.14221191, "routerloss_mlp": 0.0, "step": 1098, "time_per_iteration": 2.6314592361450195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176615, "balance_loss_mlp": 1.16195273, "diversity_loss_mlp": 0.0, "epoch": 0.21142747210465565, "flos": 451188652032.0, "grad_norm": 0.09094665560265827, "language_loss": 0.85629344, "learning_rate": 0.000916145424425759, "loss": 0.86805964, "num_input_tokens_seen": 91086752, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1099, "time_per_iteration": 2.6608541011810303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181873, "balance_loss_mlp": 1.16744852, "diversity_loss_mlp": 0.0, "epoch": 0.2116198537899192, "flos": 876175045632.0, "grad_norm": 0.09944182260515583, "language_loss": 0.9083795, "learning_rate": 0.0009159726435426885, "loss": 0.9201982, "num_input_tokens_seen": 91162960, "router_z_loss_mlp": 0.14416504, "routerloss_mlp": 0.0, "step": 1100, "time_per_iteration": 3.0502405166625977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149234, "balance_loss_mlp": 1.134619, "diversity_loss_mlp": 0.0, "epoch": 0.21181223547518277, "flos": 523662921216.0, "grad_norm": 0.09151162791452093, "language_loss": 0.90900993, "learning_rate": 0.0009157997011647154, "loss": 0.92050231, "num_input_tokens_seen": 91229840, "router_z_loss_mlp": 0.14611816, "routerloss_mlp": 0.0, "step": 1101, "time_per_iteration": 2.6048476696014404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127613, "balance_loss_mlp": 1.11389172, "diversity_loss_mlp": 0.0, "epoch": 0.21200461716044633, "flos": 572296015872.0, "grad_norm": 0.07696729699318336, "language_loss": 0.86130077, "learning_rate": 0.0009156265973589817, "loss": 0.87257689, "num_input_tokens_seen": 91307936, "router_z_loss_mlp": 0.13745117, "routerloss_mlp": 0.0, "step": 1102, "time_per_iteration": 2.7552144527435303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114805, "balance_loss_mlp": 1.10088181, "diversity_loss_mlp": 0.0, "epoch": 0.2121969988457099, "flos": 545129409024.0, "grad_norm": 0.07661877314329607, "language_loss": 0.89485067, "learning_rate": 0.0009154533321926926, "loss": 0.90599877, "num_input_tokens_seen": 91372848, "router_z_loss_mlp": 0.13909912, "routerloss_mlp": 0.0, "step": 1103, "time_per_iteration": 4.073851108551025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105254, "balance_loss_mlp": 1.09134197, "diversity_loss_mlp": 0.0, "epoch": 0.21238938053097345, "flos": 843861514752.0, "grad_norm": 0.08363594534482698, "language_loss": 0.8717171, "learning_rate": 0.0009152799057331156, "loss": 0.88276958, "num_input_tokens_seen": 91452768, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1104, "time_per_iteration": 3.142221450805664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100132, "balance_loss_mlp": 1.08656633, "diversity_loss_mlp": 0.0, "epoch": 0.212581762216237, "flos": 446214998016.0, "grad_norm": 0.1056362594360365, "language_loss": 0.91270363, "learning_rate": 0.0009151063180475805, "loss": 0.92370498, "num_input_tokens_seen": 91519888, "router_z_loss_mlp": 0.13598633, "routerloss_mlp": 0.0, "step": 1105, "time_per_iteration": 2.512547016143799 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095772, "balance_loss_mlp": 1.08196795, "diversity_loss_mlp": 0.0, "epoch": 0.21277414390150057, "flos": 514380036096.0, "grad_norm": 0.08072473316090223, "language_loss": 0.84285367, "learning_rate": 0.0009149325692034803, "loss": 0.85381138, "num_input_tokens_seen": 91585744, "router_z_loss_mlp": 0.13818359, "routerloss_mlp": 0.0, "step": 1106, "time_per_iteration": 2.5711469650268555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071395, "balance_loss_mlp": 1.06266928, "diversity_loss_mlp": 0.0, "epoch": 0.21296652558676413, "flos": 1485532846080.0, "grad_norm": 0.04229613635199888, "language_loss": 0.79203427, "learning_rate": 0.0009147586592682702, "loss": 0.8027482, "num_input_tokens_seen": 91805840, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 1107, "time_per_iteration": 4.817704916000366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129097, "balance_loss_mlp": 1.11547112, "diversity_loss_mlp": 0.0, "epoch": 0.21315890727202771, "flos": 846040909824.0, "grad_norm": 0.07382538641756346, "language_loss": 0.8748607, "learning_rate": 0.0009145845883094678, "loss": 0.88615161, "num_input_tokens_seen": 91885936, "router_z_loss_mlp": 0.13659668, "routerloss_mlp": 0.0, "step": 1108, "time_per_iteration": 3.039318561553955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150208, "balance_loss_mlp": 1.13671303, "diversity_loss_mlp": 0.0, "epoch": 0.21335128895729127, "flos": 629379445248.0, "grad_norm": 0.07887220377556703, "language_loss": 0.85174125, "learning_rate": 0.000914410356394654, "loss": 0.86324334, "num_input_tokens_seen": 91959888, "router_z_loss_mlp": 0.13525391, "routerloss_mlp": 0.0, "step": 1109, "time_per_iteration": 2.76413893699646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116209, "balance_loss_mlp": 1.1484766, "diversity_loss_mlp": 0.0, "epoch": 0.21354367064255483, "flos": 710975812608.0, "grad_norm": 0.06362602917472766, "language_loss": 0.84447891, "learning_rate": 0.0009142359635914709, "loss": 0.85609984, "num_input_tokens_seen": 92043728, "router_z_loss_mlp": 0.13635254, "routerloss_mlp": 0.0, "step": 1110, "time_per_iteration": 3.007201671600342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163563, "balance_loss_mlp": 1.15004468, "diversity_loss_mlp": 0.0, "epoch": 0.2137360523278184, "flos": 456201953280.0, "grad_norm": 0.07633144605420673, "language_loss": 0.84598219, "learning_rate": 0.0009140614099676245, "loss": 0.85761786, "num_input_tokens_seen": 92114096, "router_z_loss_mlp": 0.13537598, "routerloss_mlp": 0.0, "step": 1111, "time_per_iteration": 2.569401979446411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01161722, "balance_loss_mlp": 1.14807272, "diversity_loss_mlp": 0.0, "epoch": 0.21392843401308195, "flos": 666051393024.0, "grad_norm": 0.0712977258009472, "language_loss": 0.82590818, "learning_rate": 0.0009138866955908821, "loss": 0.83752549, "num_input_tokens_seen": 92193552, "router_z_loss_mlp": 0.13671875, "routerloss_mlp": 0.0, "step": 1112, "time_per_iteration": 2.870701789855957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166663, "balance_loss_mlp": 1.15294182, "diversity_loss_mlp": 0.0, "epoch": 0.2141208156983455, "flos": 748996803072.0, "grad_norm": 0.09239605609063735, "language_loss": 0.80485952, "learning_rate": 0.0009137118205290738, "loss": 0.81652606, "num_input_tokens_seen": 92279248, "router_z_loss_mlp": 0.13739014, "routerloss_mlp": 0.0, "step": 1113, "time_per_iteration": 2.9623591899871826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174843, "balance_loss_mlp": 1.16082442, "diversity_loss_mlp": 0.0, "epoch": 0.21431319738360907, "flos": 419119971840.0, "grad_norm": 0.08763873550503462, "language_loss": 0.90553653, "learning_rate": 0.0009135367848500924, "loss": 0.91728497, "num_input_tokens_seen": 92344064, "router_z_loss_mlp": 0.14025879, "routerloss_mlp": 0.0, "step": 1114, "time_per_iteration": 2.5287492275238037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165146, "balance_loss_mlp": 1.15138936, "diversity_loss_mlp": 0.0, "epoch": 0.21450557906887263, "flos": 609126179328.0, "grad_norm": 0.11593363319598911, "language_loss": 0.86361086, "learning_rate": 0.0009133615886218927, "loss": 0.87526232, "num_input_tokens_seen": 92410544, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1115, "time_per_iteration": 2.6945505142211914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141616, "balance_loss_mlp": 1.12725139, "diversity_loss_mlp": 0.0, "epoch": 0.21469796075413622, "flos": 561913708032.0, "grad_norm": 0.08371979294567897, "language_loss": 0.87389791, "learning_rate": 0.0009131862319124917, "loss": 0.88531411, "num_input_tokens_seen": 92480272, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1116, "time_per_iteration": 2.6219210624694824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130003, "balance_loss_mlp": 1.1162107, "diversity_loss_mlp": 0.0, "epoch": 0.21489034243939978, "flos": 594637272576.0, "grad_norm": 0.08272793517794225, "language_loss": 0.83981287, "learning_rate": 0.0009130107147899691, "loss": 0.85111284, "num_input_tokens_seen": 92555584, "router_z_loss_mlp": 0.13806152, "routerloss_mlp": 0.0, "step": 1117, "time_per_iteration": 2.698151111602783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118094, "balance_loss_mlp": 1.1039083, "diversity_loss_mlp": 0.0, "epoch": 0.21508272412466334, "flos": 441898426368.0, "grad_norm": 0.4685945915436946, "language_loss": 0.85086691, "learning_rate": 0.0009128350373224665, "loss": 0.86204791, "num_input_tokens_seen": 92623136, "router_z_loss_mlp": 0.14172363, "routerloss_mlp": 0.0, "step": 1118, "time_per_iteration": 2.545565128326416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059182, "balance_loss_mlp": 1.04950213, "diversity_loss_mlp": 0.0, "epoch": 0.2152751058099269, "flos": 1496162202624.0, "grad_norm": 0.03761711697708654, "language_loss": 0.81456429, "learning_rate": 0.0009126591995781883, "loss": 0.82515609, "num_input_tokens_seen": 92842608, "router_z_loss_mlp": 0.09667969, "routerloss_mlp": 0.0, "step": 1119, "time_per_iteration": 4.648902416229248 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118843, "balance_loss_mlp": 1.10412121, "diversity_loss_mlp": 0.0, "epoch": 0.21546748749519046, "flos": 494005630464.0, "grad_norm": 0.07492511871579786, "language_loss": 0.85205054, "learning_rate": 0.0009124832016254005, "loss": 0.86323893, "num_input_tokens_seen": 92912960, "router_z_loss_mlp": 0.1472168, "routerloss_mlp": 0.0, "step": 1120, "time_per_iteration": 2.5875513553619385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112958, "balance_loss_mlp": 1.11404657, "diversity_loss_mlp": 0.0, "epoch": 0.21565986918045402, "flos": 634531138560.0, "grad_norm": 0.10623123993924175, "language_loss": 0.88117284, "learning_rate": 0.0009123070435324316, "loss": 0.89246857, "num_input_tokens_seen": 92982272, "router_z_loss_mlp": 0.15515137, "routerloss_mlp": 0.0, "step": 1121, "time_per_iteration": 2.752814769744873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119114, "balance_loss_mlp": 1.10852826, "diversity_loss_mlp": 0.0, "epoch": 0.21585225086571758, "flos": 1583359570944.0, "grad_norm": 0.05861429426141409, "language_loss": 0.77875781, "learning_rate": 0.0009121307253676722, "loss": 0.78994894, "num_input_tokens_seen": 93218752, "router_z_loss_mlp": 0.10595703, "routerloss_mlp": 0.0, "step": 1122, "time_per_iteration": 4.993450880050659 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114894, "balance_loss_mlp": 1.13229823, "diversity_loss_mlp": 0.0, "epoch": 0.21604463255098114, "flos": 684103242240.0, "grad_norm": 0.09758120262844092, "language_loss": 0.86477894, "learning_rate": 0.0009119542471995752, "loss": 0.87626839, "num_input_tokens_seen": 93293968, "router_z_loss_mlp": 0.16650391, "routerloss_mlp": 0.0, "step": 1123, "time_per_iteration": 2.8260560035705566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132116, "balance_loss_mlp": 1.1160109, "diversity_loss_mlp": 0.0, "epoch": 0.2162370142362447, "flos": 780989133312.0, "grad_norm": 0.1175490331770948, "language_loss": 0.81597894, "learning_rate": 0.0009117776090966554, "loss": 0.82730007, "num_input_tokens_seen": 93367088, "router_z_loss_mlp": 0.16101074, "routerloss_mlp": 0.0, "step": 1124, "time_per_iteration": 2.955768585205078 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133281, "balance_loss_mlp": 1.1166153, "diversity_loss_mlp": 0.0, "epoch": 0.21642939592150828, "flos": 1002147406848.0, "grad_norm": 0.08908783615486303, "language_loss": 0.86717665, "learning_rate": 0.0009116008111274899, "loss": 0.87850952, "num_input_tokens_seen": 93452944, "router_z_loss_mlp": 0.16674805, "routerloss_mlp": 0.0, "step": 1125, "time_per_iteration": 3.2493131160736084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01038655, "balance_loss_mlp": 1.02921367, "diversity_loss_mlp": 0.0, "epoch": 0.21662177760677184, "flos": 1482644238336.0, "grad_norm": 0.03267712428803131, "language_loss": 0.79106927, "learning_rate": 0.0009114238533607176, "loss": 0.80145574, "num_input_tokens_seen": 93677328, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 1126, "time_per_iteration": 4.8121678829193115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148434, "balance_loss_mlp": 1.13257909, "diversity_loss_mlp": 0.0, "epoch": 0.2168141592920354, "flos": 887395046400.0, "grad_norm": 0.09699177011816186, "language_loss": 0.85244691, "learning_rate": 0.0009112467358650396, "loss": 0.86393118, "num_input_tokens_seen": 93756848, "router_z_loss_mlp": 0.15856934, "routerloss_mlp": 0.0, "step": 1127, "time_per_iteration": 3.144075393676758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166528, "balance_loss_mlp": 1.15056634, "diversity_loss_mlp": 0.0, "epoch": 0.21700654097729896, "flos": 545961959424.0, "grad_norm": 0.07985175184807933, "language_loss": 0.86319685, "learning_rate": 0.0009110694587092192, "loss": 0.87486213, "num_input_tokens_seen": 93834704, "router_z_loss_mlp": 0.1595459, "routerloss_mlp": 0.0, "step": 1128, "time_per_iteration": 2.7497644424438477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01179675, "balance_loss_mlp": 1.1634866, "diversity_loss_mlp": 0.0, "epoch": 0.21719892266256252, "flos": 509522379264.0, "grad_norm": 0.1038215552752292, "language_loss": 0.81267089, "learning_rate": 0.0009108920219620815, "loss": 0.82446766, "num_input_tokens_seen": 93904448, "router_z_loss_mlp": 0.16186523, "routerloss_mlp": 0.0, "step": 1129, "time_per_iteration": 2.6150496006011963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195026, "balance_loss_mlp": 1.1788609, "diversity_loss_mlp": 0.0, "epoch": 0.21739130434782608, "flos": 543412177920.0, "grad_norm": 0.06771714561059723, "language_loss": 0.89286679, "learning_rate": 0.0009107144256925133, "loss": 0.9048171, "num_input_tokens_seen": 93979312, "router_z_loss_mlp": 0.16162109, "routerloss_mlp": 0.0, "step": 1130, "time_per_iteration": 2.6569926738739014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196317, "balance_loss_mlp": 1.18006873, "diversity_loss_mlp": 0.0, "epoch": 0.21758368603308964, "flos": 616847477760.0, "grad_norm": 0.08333124164895586, "language_loss": 0.82520813, "learning_rate": 0.0009105366699694638, "loss": 0.83717132, "num_input_tokens_seen": 94052032, "router_z_loss_mlp": 0.16247559, "routerloss_mlp": 0.0, "step": 1131, "time_per_iteration": 2.7384698390960693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01200769, "balance_loss_mlp": 1.18390059, "diversity_loss_mlp": 0.0, "epoch": 0.2177760677183532, "flos": 635116640256.0, "grad_norm": 0.07018840625680964, "language_loss": 0.81826723, "learning_rate": 0.0009103587548619439, "loss": 0.83027488, "num_input_tokens_seen": 94124944, "router_z_loss_mlp": 0.16882324, "routerloss_mlp": 0.0, "step": 1132, "time_per_iteration": 2.8361291885375977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188026, "balance_loss_mlp": 1.17064476, "diversity_loss_mlp": 0.0, "epoch": 0.2179684494036168, "flos": 532463818752.0, "grad_norm": 0.08238158624987729, "language_loss": 0.85952497, "learning_rate": 0.0009101806804390261, "loss": 0.87140524, "num_input_tokens_seen": 94200384, "router_z_loss_mlp": 0.1739502, "routerloss_mlp": 0.0, "step": 1133, "time_per_iteration": 2.8646528720855713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00846565, "balance_loss_mlp": 1.45559311, "diversity_loss_mlp": 0.20202307, "epoch": 0.21816083108888035, "flos": 475219975680.0, "grad_norm": 0.03511986753794681, "language_loss": 0.90682399, "learning_rate": 0.0009100024467698453, "loss": 0.91528964, "num_input_tokens_seen": 94266992, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01775702, "step": 1134, "time_per_iteration": 2.628955364227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0119036, "balance_loss_mlp": 1.17289567, "diversity_loss_mlp": 0.0, "epoch": 0.2183532127741439, "flos": 577467532800.0, "grad_norm": 0.09831196896097749, "language_loss": 0.82889581, "learning_rate": 0.0009098240539235981, "loss": 0.84079945, "num_input_tokens_seen": 94334304, "router_z_loss_mlp": 0.17492676, "routerloss_mlp": 0.0, "step": 1135, "time_per_iteration": 2.6857638359069824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01179858, "balance_loss_mlp": 1.16191649, "diversity_loss_mlp": 0.0, "epoch": 0.21854559445940747, "flos": 594120780288.0, "grad_norm": 0.07855046788509763, "language_loss": 0.87649047, "learning_rate": 0.0009096455019695423, "loss": 0.88828909, "num_input_tokens_seen": 94413296, "router_z_loss_mlp": 0.1796875, "routerloss_mlp": 0.0, "step": 1136, "time_per_iteration": 2.814746856689453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01175201, "balance_loss_mlp": 1.15702188, "diversity_loss_mlp": 0.0, "epoch": 0.21873797614467103, "flos": 408680764416.0, "grad_norm": 0.090535881946018, "language_loss": 0.89789271, "learning_rate": 0.000909466790976998, "loss": 0.90964472, "num_input_tokens_seen": 94475840, "router_z_loss_mlp": 0.18188477, "routerloss_mlp": 0.0, "step": 1137, "time_per_iteration": 2.503934144973755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151304, "balance_loss_mlp": 1.13231349, "diversity_loss_mlp": 0.0, "epoch": 0.21893035782993459, "flos": 894189818880.0, "grad_norm": 0.07386356915969775, "language_loss": 0.82546908, "learning_rate": 0.0009092879210153473, "loss": 0.83698207, "num_input_tokens_seen": 94555184, "router_z_loss_mlp": 0.18981934, "routerloss_mlp": 0.0, "step": 1138, "time_per_iteration": 3.106015682220459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143167, "balance_loss_mlp": 1.12445128, "diversity_loss_mlp": 0.0, "epoch": 0.21912273951519814, "flos": 467627157504.0, "grad_norm": 0.08443059177839436, "language_loss": 0.89126158, "learning_rate": 0.0009091088921540333, "loss": 0.90269327, "num_input_tokens_seen": 94622656, "router_z_loss_mlp": 0.18701172, "routerloss_mlp": 0.0, "step": 1139, "time_per_iteration": 2.5165584087371826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01197317, "balance_loss_mlp": 1.18491888, "diversity_loss_mlp": 0.0, "epoch": 0.2193151212004617, "flos": 1532043445248.0, "grad_norm": 0.06938907882855633, "language_loss": 0.75508678, "learning_rate": 0.0009089297044625615, "loss": 0.76705992, "num_input_tokens_seen": 94856496, "router_z_loss_mlp": 0.12402344, "routerloss_mlp": 0.0, "step": 1140, "time_per_iteration": 4.907839775085449 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00845315, "balance_loss_mlp": 1.45913088, "diversity_loss_mlp": 0.19676474, "epoch": 0.2195075028857253, "flos": 591175646208.0, "grad_norm": 0.04157801253712285, "language_loss": 0.84799111, "learning_rate": 0.0009087503580104985, "loss": 0.8564443, "num_input_tokens_seen": 94926880, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01736734, "step": 1141, "time_per_iteration": 2.6928980350494385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106137, "balance_loss_mlp": 1.08643126, "diversity_loss_mlp": 0.0, "epoch": 0.21969988457098885, "flos": 636329862144.0, "grad_norm": 0.09652849342648293, "language_loss": 0.7964108, "learning_rate": 0.0009085708528674728, "loss": 0.80747211, "num_input_tokens_seen": 95000528, "router_z_loss_mlp": 0.19689941, "routerloss_mlp": 0.0, "step": 1142, "time_per_iteration": 2.7800490856170654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115066, "balance_loss_mlp": 1.09476519, "diversity_loss_mlp": 0.0, "epoch": 0.2198922662562524, "flos": 912350324736.0, "grad_norm": 0.11345906914127299, "language_loss": 0.8700006, "learning_rate": 0.0009083911891031745, "loss": 0.88115132, "num_input_tokens_seen": 95081040, "router_z_loss_mlp": 0.20300293, "routerloss_mlp": 0.0, "step": 1143, "time_per_iteration": 3.104893684387207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110402, "balance_loss_mlp": 1.08533978, "diversity_loss_mlp": 0.0, "epoch": 0.22008464794151597, "flos": 822980528640.0, "grad_norm": 0.12428556161586228, "language_loss": 0.91569418, "learning_rate": 0.0009082113667873553, "loss": 0.92673439, "num_input_tokens_seen": 95167328, "router_z_loss_mlp": 0.18676758, "routerloss_mlp": 0.0, "step": 1144, "time_per_iteration": 3.0838277339935303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138926, "balance_loss_mlp": 1.12060392, "diversity_loss_mlp": 0.0, "epoch": 0.22027702962677953, "flos": 459656239104.0, "grad_norm": 0.0955721440223133, "language_loss": 0.90911627, "learning_rate": 0.0009080313859898283, "loss": 0.92050546, "num_input_tokens_seen": 95230304, "router_z_loss_mlp": 0.18334961, "routerloss_mlp": 0.0, "step": 1145, "time_per_iteration": 2.4998109340667725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162548, "balance_loss_mlp": 1.14463091, "diversity_loss_mlp": 0.0, "epoch": 0.2204694113120431, "flos": 531255739392.0, "grad_norm": 0.07871728913387968, "language_loss": 0.91642439, "learning_rate": 0.0009078512467804684, "loss": 0.92804986, "num_input_tokens_seen": 95299520, "router_z_loss_mlp": 0.17932129, "routerloss_mlp": 0.0, "step": 1146, "time_per_iteration": 2.583137273788452 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01192448, "balance_loss_mlp": 1.17516243, "diversity_loss_mlp": 0.0, "epoch": 0.22066179299730665, "flos": 522642419712.0, "grad_norm": 0.10815580627735921, "language_loss": 0.90245295, "learning_rate": 0.0009076709492292119, "loss": 0.91437739, "num_input_tokens_seen": 95368912, "router_z_loss_mlp": 0.1730957, "routerloss_mlp": 0.0, "step": 1147, "time_per_iteration": 2.6189510822296143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199389, "balance_loss_mlp": 1.18260384, "diversity_loss_mlp": 0.0, "epoch": 0.2208541746825702, "flos": 546451287552.0, "grad_norm": 0.10018226205073696, "language_loss": 0.88948917, "learning_rate": 0.0009074904934060562, "loss": 0.90148306, "num_input_tokens_seen": 95440800, "router_z_loss_mlp": 0.16796875, "routerloss_mlp": 0.0, "step": 1148, "time_per_iteration": 2.6619913578033447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0119284, "balance_loss_mlp": 1.17623389, "diversity_loss_mlp": 0.0, "epoch": 0.22104655636783377, "flos": 708734748672.0, "grad_norm": 0.09879445691718633, "language_loss": 0.85041308, "learning_rate": 0.0009073098793810607, "loss": 0.8623414, "num_input_tokens_seen": 95519904, "router_z_loss_mlp": 0.1661377, "routerloss_mlp": 0.0, "step": 1149, "time_per_iteration": 2.9382119178771973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185083, "balance_loss_mlp": 1.16848898, "diversity_loss_mlp": 0.0, "epoch": 0.22123893805309736, "flos": 584867630592.0, "grad_norm": 0.09716543961816822, "language_loss": 0.88557786, "learning_rate": 0.000907129107224346, "loss": 0.89742863, "num_input_tokens_seen": 95591568, "router_z_loss_mlp": 0.16601562, "routerloss_mlp": 0.0, "step": 1150, "time_per_iteration": 2.717400550842285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01190142, "balance_loss_mlp": 1.17356002, "diversity_loss_mlp": 0.0, "epoch": 0.22143131973836092, "flos": 492251323392.0, "grad_norm": 0.0741661773141201, "language_loss": 0.88313866, "learning_rate": 0.0009069481770060939, "loss": 0.89504004, "num_input_tokens_seen": 95664480, "router_z_loss_mlp": 0.16589355, "routerloss_mlp": 0.0, "step": 1151, "time_per_iteration": 2.676938056945801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0118655, "balance_loss_mlp": 1.17039752, "diversity_loss_mlp": 0.0, "epoch": 0.22162370142362448, "flos": 1079674251264.0, "grad_norm": 0.06827936796637825, "language_loss": 0.83848286, "learning_rate": 0.000906767088796548, "loss": 0.85034835, "num_input_tokens_seen": 95754400, "router_z_loss_mlp": 0.16149902, "routerloss_mlp": 0.0, "step": 1152, "time_per_iteration": 3.442782163619995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185002, "balance_loss_mlp": 1.16889715, "diversity_loss_mlp": 0.0, "epoch": 0.22181608310888803, "flos": 492508283904.0, "grad_norm": 0.07358747282835834, "language_loss": 0.87001419, "learning_rate": 0.0009065858426660127, "loss": 0.88186425, "num_input_tokens_seen": 95826944, "router_z_loss_mlp": 0.16101074, "routerloss_mlp": 0.0, "step": 1153, "time_per_iteration": 2.6501753330230713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178927, "balance_loss_mlp": 1.16286922, "diversity_loss_mlp": 0.0, "epoch": 0.2220084647941516, "flos": 724014360576.0, "grad_norm": 0.0863709920952229, "language_loss": 0.84764236, "learning_rate": 0.0009064044386848543, "loss": 0.85943162, "num_input_tokens_seen": 95902688, "router_z_loss_mlp": 0.16052246, "routerloss_mlp": 0.0, "step": 1154, "time_per_iteration": 2.920689344406128 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01176891, "balance_loss_mlp": 1.16032064, "diversity_loss_mlp": 0.0, "epoch": 0.22220084647941515, "flos": 489239377920.0, "grad_norm": 0.07669791788600007, "language_loss": 0.88829726, "learning_rate": 0.0009062228769234997, "loss": 0.90006614, "num_input_tokens_seen": 95969952, "router_z_loss_mlp": 0.16577148, "routerloss_mlp": 0.0, "step": 1155, "time_per_iteration": 2.561638832092285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154629, "balance_loss_mlp": 1.13797593, "diversity_loss_mlp": 0.0, "epoch": 0.2223932281646787, "flos": 536278952448.0, "grad_norm": 0.08447027490527963, "language_loss": 0.81123281, "learning_rate": 0.0009060411574524376, "loss": 0.82277906, "num_input_tokens_seen": 96037344, "router_z_loss_mlp": 0.16662598, "routerloss_mlp": 0.0, "step": 1156, "time_per_iteration": 2.655132293701172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162354, "balance_loss_mlp": 1.14597416, "diversity_loss_mlp": 0.0, "epoch": 0.22258560984994227, "flos": 931420104192.0, "grad_norm": 0.08665349089557017, "language_loss": 0.87817705, "learning_rate": 0.0009058592803422178, "loss": 0.88980061, "num_input_tokens_seen": 96115616, "router_z_loss_mlp": 0.16381836, "routerloss_mlp": 0.0, "step": 1157, "time_per_iteration": 3.1417362689971924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01183028, "balance_loss_mlp": 1.17430186, "diversity_loss_mlp": 0.0, "epoch": 0.22277799153520586, "flos": 1199675930112.0, "grad_norm": 0.06198684812147071, "language_loss": 0.78710288, "learning_rate": 0.0009056772456634512, "loss": 0.79893315, "num_input_tokens_seen": 96333600, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 1158, "time_per_iteration": 4.867843866348267 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128672, "balance_loss_mlp": 1.1120069, "diversity_loss_mlp": 0.0, "epoch": 0.22297037322046942, "flos": 501304412160.0, "grad_norm": 0.0864152607347894, "language_loss": 0.90156865, "learning_rate": 0.00090549505348681, "loss": 0.91285539, "num_input_tokens_seen": 96402544, "router_z_loss_mlp": 0.16674805, "routerloss_mlp": 0.0, "step": 1159, "time_per_iteration": 2.581865072250366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118982, "balance_loss_mlp": 1.1025548, "diversity_loss_mlp": 0.0, "epoch": 0.22316275490573298, "flos": 752752465920.0, "grad_norm": 0.07056827667929483, "language_loss": 0.83819324, "learning_rate": 0.0009053127038830275, "loss": 0.84938306, "num_input_tokens_seen": 96487600, "router_z_loss_mlp": 0.16430664, "routerloss_mlp": 0.0, "step": 1160, "time_per_iteration": 2.9969708919525146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00881169, "balance_loss_mlp": 1.53314447, "diversity_loss_mlp": 0.19063006, "epoch": 0.22335513659099654, "flos": 514802552832.0, "grad_norm": 0.04002382495760162, "language_loss": 0.87460124, "learning_rate": 0.000905130196922898, "loss": 0.88341296, "num_input_tokens_seen": 96554912, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01928164, "step": 1161, "time_per_iteration": 2.6307718753814697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00881407, "balance_loss_mlp": 1.5316093, "diversity_loss_mlp": 0.19140732, "epoch": 0.2235475182762601, "flos": 484530024960.0, "grad_norm": 0.030280826501304762, "language_loss": 0.86784196, "learning_rate": 0.0009049475326772769, "loss": 0.87665606, "num_input_tokens_seen": 96624192, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01989887, "step": 1162, "time_per_iteration": 2.6021478176116943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00889034, "balance_loss_mlp": 1.54766631, "diversity_loss_mlp": 0.19066738, "epoch": 0.22373989996152366, "flos": 469971735552.0, "grad_norm": 0.03198536270345376, "language_loss": 0.83124602, "learning_rate": 0.0009047647112170811, "loss": 0.84013629, "num_input_tokens_seen": 96701040, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01986698, "step": 1163, "time_per_iteration": 2.804150342941284 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123868, "balance_loss_mlp": 1.1070838, "diversity_loss_mlp": 0.0, "epoch": 0.22393228164678722, "flos": 1271012249088.0, "grad_norm": 0.09901141435665076, "language_loss": 0.87948084, "learning_rate": 0.0009045817326132876, "loss": 0.89071947, "num_input_tokens_seen": 96791200, "router_z_loss_mlp": 0.16796875, "routerloss_mlp": 0.0, "step": 1164, "time_per_iteration": 3.6840732097625732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125369, "balance_loss_mlp": 1.107988, "diversity_loss_mlp": 0.0, "epoch": 0.22412466333205078, "flos": 596334680064.0, "grad_norm": 0.08432013167879508, "language_loss": 0.83142793, "learning_rate": 0.0009043985969369357, "loss": 0.84268159, "num_input_tokens_seen": 96869360, "router_z_loss_mlp": 0.17407227, "routerloss_mlp": 0.0, "step": 1165, "time_per_iteration": 2.8148193359375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146301, "balance_loss_mlp": 1.12976706, "diversity_loss_mlp": 0.0, "epoch": 0.22431704501731436, "flos": 608434219008.0, "grad_norm": 0.06944445596490195, "language_loss": 0.84334069, "learning_rate": 0.0009042153042591245, "loss": 0.85480368, "num_input_tokens_seen": 96945840, "router_z_loss_mlp": 0.16540527, "routerloss_mlp": 0.0, "step": 1166, "time_per_iteration": 2.8004493713378906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142176, "balance_loss_mlp": 1.12542677, "diversity_loss_mlp": 0.0, "epoch": 0.22450942670257792, "flos": 906583394304.0, "grad_norm": 0.06821660135571728, "language_loss": 0.85225487, "learning_rate": 0.0009040318546510146, "loss": 0.86367661, "num_input_tokens_seen": 97029296, "router_z_loss_mlp": 0.16760254, "routerloss_mlp": 0.0, "step": 1167, "time_per_iteration": 3.1969215869903564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156354, "balance_loss_mlp": 1.13979554, "diversity_loss_mlp": 0.0, "epoch": 0.22470180838784148, "flos": 565301182464.0, "grad_norm": 0.06547364647617461, "language_loss": 0.84988701, "learning_rate": 0.0009038482481838275, "loss": 0.86145055, "num_input_tokens_seen": 97097776, "router_z_loss_mlp": 0.16564941, "routerloss_mlp": 0.0, "step": 1168, "time_per_iteration": 2.7087180614471436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00861334, "balance_loss_mlp": 1.49333596, "diversity_loss_mlp": 0.19261675, "epoch": 0.22489419007310504, "flos": 834469972992.0, "grad_norm": 0.02892951533663535, "language_loss": 0.87266529, "learning_rate": 0.0009036644849288455, "loss": 0.88127863, "num_input_tokens_seen": 97181424, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01835741, "step": 1169, "time_per_iteration": 3.1039352416992188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01179898, "balance_loss_mlp": 1.1631248, "diversity_loss_mlp": 0.0, "epoch": 0.2250865717583686, "flos": 581057639424.0, "grad_norm": 0.06865085555084699, "language_loss": 0.85404736, "learning_rate": 0.0009034805649574118, "loss": 0.86584634, "num_input_tokens_seen": 97252128, "router_z_loss_mlp": 0.16784668, "routerloss_mlp": 0.0, "step": 1170, "time_per_iteration": 2.659322738647461 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01208955, "balance_loss_mlp": 1.1926589, "diversity_loss_mlp": 0.0, "epoch": 0.22527895344363216, "flos": 600406401024.0, "grad_norm": 0.07685307661183591, "language_loss": 0.85691977, "learning_rate": 0.0009032964883409308, "loss": 0.86900926, "num_input_tokens_seen": 97326640, "router_z_loss_mlp": 0.16296387, "routerloss_mlp": 0.0, "step": 1171, "time_per_iteration": 2.8938751220703125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128211, "balance_loss_mlp": 1.11910319, "diversity_loss_mlp": 0.0, "epoch": 0.22547133512889572, "flos": 1440751587840.0, "grad_norm": 0.06058864885284362, "language_loss": 0.73050535, "learning_rate": 0.000903112255150867, "loss": 0.74178743, "num_input_tokens_seen": 97553952, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 1172, "time_per_iteration": 4.983820676803589 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01217918, "balance_loss_mlp": 1.20207548, "diversity_loss_mlp": 0.0, "epoch": 0.22566371681415928, "flos": 490618156032.0, "grad_norm": 0.1048847225020503, "language_loss": 0.8717351, "learning_rate": 0.0009029278654587462, "loss": 0.88391435, "num_input_tokens_seen": 97623584, "router_z_loss_mlp": 0.1583252, "routerloss_mlp": 0.0, "step": 1173, "time_per_iteration": 2.639632225036621 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01181665, "balance_loss_mlp": 1.16508245, "diversity_loss_mlp": 0.0, "epoch": 0.22585609849942284, "flos": 604616887296.0, "grad_norm": 0.07111002228073603, "language_loss": 0.82226282, "learning_rate": 0.0009027433193361548, "loss": 0.83407944, "num_input_tokens_seen": 97695952, "router_z_loss_mlp": 0.16589355, "routerloss_mlp": 0.0, "step": 1174, "time_per_iteration": 2.7443323135375977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159983, "balance_loss_mlp": 1.14366364, "diversity_loss_mlp": 0.0, "epoch": 0.22604848018468643, "flos": 635568892416.0, "grad_norm": 0.06531304020653, "language_loss": 0.86980343, "learning_rate": 0.00090255861685474, "loss": 0.88140327, "num_input_tokens_seen": 97764544, "router_z_loss_mlp": 0.16320801, "routerloss_mlp": 0.0, "step": 1175, "time_per_iteration": 2.7534220218658447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142116, "balance_loss_mlp": 1.12533128, "diversity_loss_mlp": 0.0, "epoch": 0.22624086186995, "flos": 479875000320.0, "grad_norm": 0.10016618462748716, "language_loss": 0.90750074, "learning_rate": 0.0009023737580862095, "loss": 0.91892195, "num_input_tokens_seen": 97830976, "router_z_loss_mlp": 0.16796875, "routerloss_mlp": 0.0, "step": 1176, "time_per_iteration": 2.5116937160491943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114077, "balance_loss_mlp": 1.12470055, "diversity_loss_mlp": 0.0, "epoch": 0.22643324355521355, "flos": 495814265856.0, "grad_norm": 0.0707285441494173, "language_loss": 0.83225566, "learning_rate": 0.0009021887431023321, "loss": 0.84366333, "num_input_tokens_seen": 97898800, "router_z_loss_mlp": 0.16064453, "routerloss_mlp": 0.0, "step": 1177, "time_per_iteration": 2.599956512451172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130034, "balance_loss_mlp": 1.11444104, "diversity_loss_mlp": 0.0, "epoch": 0.2266256252404771, "flos": 561552860160.0, "grad_norm": 0.08431891612549362, "language_loss": 0.87212515, "learning_rate": 0.0009020035719749369, "loss": 0.88342547, "num_input_tokens_seen": 97974112, "router_z_loss_mlp": 0.15576172, "routerloss_mlp": 0.0, "step": 1178, "time_per_iteration": 2.7144312858581543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135701, "balance_loss_mlp": 1.1205014, "diversity_loss_mlp": 0.0, "epoch": 0.22681800692574067, "flos": 579688399872.0, "grad_norm": 0.09883499682369536, "language_loss": 0.77450085, "learning_rate": 0.0009018182447759136, "loss": 0.7858578, "num_input_tokens_seen": 98056640, "router_z_loss_mlp": 0.1517334, "routerloss_mlp": 0.0, "step": 1179, "time_per_iteration": 2.98848557472229 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137145, "balance_loss_mlp": 1.12187457, "diversity_loss_mlp": 0.0, "epoch": 0.22701038861100423, "flos": 740166170112.0, "grad_norm": 0.08173095074239418, "language_loss": 0.79878223, "learning_rate": 0.0009016327615772126, "loss": 0.81015366, "num_input_tokens_seen": 98135952, "router_z_loss_mlp": 0.15246582, "routerloss_mlp": 0.0, "step": 1180, "time_per_iteration": 2.9338154792785645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149275, "balance_loss_mlp": 1.13449335, "diversity_loss_mlp": 0.0, "epoch": 0.2272027702962678, "flos": 577257560064.0, "grad_norm": 0.08374692364956231, "language_loss": 0.87680298, "learning_rate": 0.0009014471224508451, "loss": 0.88829577, "num_input_tokens_seen": 98204288, "router_z_loss_mlp": 0.14758301, "routerloss_mlp": 0.0, "step": 1181, "time_per_iteration": 2.7131431102752686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00881934, "balance_loss_mlp": 1.53494334, "diversity_loss_mlp": 0.19571492, "epoch": 0.22739515198153135, "flos": 544267123200.0, "grad_norm": 0.04185105584005936, "language_loss": 0.83154267, "learning_rate": 0.0009012613274688823, "loss": 0.84036207, "num_input_tokens_seen": 98269856, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01660516, "step": 1182, "time_per_iteration": 2.649559736251831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01184244, "balance_loss_mlp": 1.1692239, "diversity_loss_mlp": 0.0, "epoch": 0.22758753366679493, "flos": 440163942912.0, "grad_norm": 0.12019924395271459, "language_loss": 0.87753081, "learning_rate": 0.0009010753767034565, "loss": 0.8893733, "num_input_tokens_seen": 98335632, "router_z_loss_mlp": 0.14990234, "routerloss_mlp": 0.0, "step": 1183, "time_per_iteration": 2.5258986949920654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01175003, "balance_loss_mlp": 1.16030502, "diversity_loss_mlp": 0.0, "epoch": 0.2277799153520585, "flos": 729447607296.0, "grad_norm": 0.08783280174490297, "language_loss": 0.78918862, "learning_rate": 0.0009008892702267599, "loss": 0.80093861, "num_input_tokens_seen": 98420592, "router_z_loss_mlp": 0.14685059, "routerloss_mlp": 0.0, "step": 1184, "time_per_iteration": 2.9962406158447266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139115, "balance_loss_mlp": 1.12460732, "diversity_loss_mlp": 0.0, "epoch": 0.22797229703732205, "flos": 526894751232.0, "grad_norm": 0.08254121322216867, "language_loss": 0.88525105, "learning_rate": 0.0009007030081110457, "loss": 0.89664215, "num_input_tokens_seen": 98488096, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1185, "time_per_iteration": 2.5990660190582275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125585, "balance_loss_mlp": 1.11087465, "diversity_loss_mlp": 0.0, "epoch": 0.2281646787225856, "flos": 535431347712.0, "grad_norm": 0.07610459395316062, "language_loss": 0.84548527, "learning_rate": 0.000900516590428627, "loss": 0.85674113, "num_input_tokens_seen": 98561664, "router_z_loss_mlp": 0.14685059, "routerloss_mlp": 0.0, "step": 1186, "time_per_iteration": 2.7377407550811768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121666, "balance_loss_mlp": 1.1070751, "diversity_loss_mlp": 0.0, "epoch": 0.22835706040784917, "flos": 541381086720.0, "grad_norm": 0.13748029932532174, "language_loss": 0.89182103, "learning_rate": 0.0009003300172518778, "loss": 0.90303767, "num_input_tokens_seen": 98634336, "router_z_loss_mlp": 0.14575195, "routerloss_mlp": 0.0, "step": 1187, "time_per_iteration": 2.6916556358337402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116227, "balance_loss_mlp": 1.10145736, "diversity_loss_mlp": 0.0, "epoch": 0.22854944209311273, "flos": 790637635584.0, "grad_norm": 0.11313229810108143, "language_loss": 0.84335989, "learning_rate": 0.0009001432886532321, "loss": 0.85452211, "num_input_tokens_seen": 98709600, "router_z_loss_mlp": 0.14758301, "routerloss_mlp": 0.0, "step": 1188, "time_per_iteration": 2.9698264598846436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114727, "balance_loss_mlp": 1.10021877, "diversity_loss_mlp": 0.0, "epoch": 0.2287418237783763, "flos": 469280148480.0, "grad_norm": 0.06729358528862889, "language_loss": 0.86774516, "learning_rate": 0.0008999564047051843, "loss": 0.87889242, "num_input_tokens_seen": 98775024, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1189, "time_per_iteration": 2.5002098083496094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136799, "balance_loss_mlp": 1.12243462, "diversity_loss_mlp": 0.0, "epoch": 0.22893420546363985, "flos": 468029850624.0, "grad_norm": 0.0714274855120672, "language_loss": 0.84824312, "learning_rate": 0.0008997693654802894, "loss": 0.85961115, "num_input_tokens_seen": 98845248, "router_z_loss_mlp": 0.14379883, "routerloss_mlp": 0.0, "step": 1190, "time_per_iteration": 2.6300055980682373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149047, "balance_loss_mlp": 1.13425303, "diversity_loss_mlp": 0.0, "epoch": 0.22912658714890344, "flos": 626258843136.0, "grad_norm": 0.07754985979781381, "language_loss": 0.86714745, "learning_rate": 0.0008995821710511625, "loss": 0.87863791, "num_input_tokens_seen": 98913584, "router_z_loss_mlp": 0.14782715, "routerloss_mlp": 0.0, "step": 1191, "time_per_iteration": 2.7126989364624023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162855, "balance_loss_mlp": 1.14807296, "diversity_loss_mlp": 0.0, "epoch": 0.229318968834167, "flos": 503031555072.0, "grad_norm": 0.11547698788472376, "language_loss": 0.85060751, "learning_rate": 0.0008993948214904786, "loss": 0.86223602, "num_input_tokens_seen": 98978608, "router_z_loss_mlp": 0.14770508, "routerloss_mlp": 0.0, "step": 1192, "time_per_iteration": 2.5562260150909424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152534, "balance_loss_mlp": 1.14361739, "diversity_loss_mlp": 0.0, "epoch": 0.22951135051943056, "flos": 1374827613696.0, "grad_norm": 0.05307726892258072, "language_loss": 0.78422213, "learning_rate": 0.0008992073168709733, "loss": 0.79574746, "num_input_tokens_seen": 99207424, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 1193, "time_per_iteration": 4.909748792648315 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01187526, "balance_loss_mlp": 1.17205215, "diversity_loss_mlp": 0.0, "epoch": 0.22970373220469412, "flos": 644345197056.0, "grad_norm": 0.09739164860103838, "language_loss": 0.78353333, "learning_rate": 0.0008990196572654427, "loss": 0.79540861, "num_input_tokens_seen": 99290592, "router_z_loss_mlp": 0.15454102, "routerloss_mlp": 0.0, "step": 1194, "time_per_iteration": 2.8592262268066406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117424, "balance_loss_mlp": 1.1592319, "diversity_loss_mlp": 0.0, "epoch": 0.22989611388995768, "flos": 500209758720.0, "grad_norm": 0.06260411033315277, "language_loss": 0.87559408, "learning_rate": 0.0008988318427467426, "loss": 0.88733649, "num_input_tokens_seen": 99366096, "router_z_loss_mlp": 0.14990234, "routerloss_mlp": 0.0, "step": 1195, "time_per_iteration": 2.7444722652435303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00878316, "balance_loss_mlp": 1.52780199, "diversity_loss_mlp": 0.1948241, "epoch": 0.23008849557522124, "flos": 1096522790400.0, "grad_norm": 0.0364111048645648, "language_loss": 0.86376345, "learning_rate": 0.0008986438733877887, "loss": 0.87254667, "num_input_tokens_seen": 99456768, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01700337, "step": 1196, "time_per_iteration": 3.5090088844299316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137546, "balance_loss_mlp": 1.1229074, "diversity_loss_mlp": 0.0, "epoch": 0.2302808772604848, "flos": 683648418816.0, "grad_norm": 0.08413871186116019, "language_loss": 0.83810687, "learning_rate": 0.0008984557492615576, "loss": 0.84948236, "num_input_tokens_seen": 99539616, "router_z_loss_mlp": 0.14624023, "routerloss_mlp": 0.0, "step": 1197, "time_per_iteration": 2.9953744411468506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122899, "balance_loss_mlp": 1.10803354, "diversity_loss_mlp": 0.0, "epoch": 0.23047325894574835, "flos": 528923271168.0, "grad_norm": 0.08617240411661099, "language_loss": 0.90267789, "learning_rate": 0.0008982674704410854, "loss": 0.91390687, "num_input_tokens_seen": 99612064, "router_z_loss_mlp": 0.14880371, "routerloss_mlp": 0.0, "step": 1198, "time_per_iteration": 2.7513339519500732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110338, "balance_loss_mlp": 1.09598517, "diversity_loss_mlp": 0.0, "epoch": 0.23066564063101191, "flos": 682766309376.0, "grad_norm": 0.11146547076727734, "language_loss": 0.77876621, "learning_rate": 0.0008980790369994682, "loss": 0.78986955, "num_input_tokens_seen": 99691040, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1199, "time_per_iteration": 2.989825487136841 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120977, "balance_loss_mlp": 1.10670781, "diversity_loss_mlp": 0.0, "epoch": 0.2308580223162755, "flos": 558523662336.0, "grad_norm": 0.0677628031660983, "language_loss": 0.8729977, "learning_rate": 0.000897890449009863, "loss": 0.88420743, "num_input_tokens_seen": 99762016, "router_z_loss_mlp": 0.14257812, "routerloss_mlp": 0.0, "step": 1200, "time_per_iteration": 2.6784448623657227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127646, "balance_loss_mlp": 1.11330509, "diversity_loss_mlp": 0.0, "epoch": 0.23105040400153906, "flos": 555669932544.0, "grad_norm": 0.080414080555838, "language_loss": 0.89825618, "learning_rate": 0.0008977017065454853, "loss": 0.90953267, "num_input_tokens_seen": 99835552, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1201, "time_per_iteration": 2.6610703468322754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00880483, "balance_loss_mlp": 1.52539706, "diversity_loss_mlp": 0.19880572, "epoch": 0.23124278568680262, "flos": 704788936704.0, "grad_norm": 0.03277795962214655, "language_loss": 0.80367738, "learning_rate": 0.0008975128096796121, "loss": 0.81248224, "num_input_tokens_seen": 99910784, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01838172, "step": 1202, "time_per_iteration": 2.901998996734619 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145011, "balance_loss_mlp": 1.13089633, "diversity_loss_mlp": 0.0, "epoch": 0.23143516737206618, "flos": 612768043008.0, "grad_norm": 0.10693947298766643, "language_loss": 0.85848922, "learning_rate": 0.0008973237584855794, "loss": 0.86993933, "num_input_tokens_seen": 99991120, "router_z_loss_mlp": 0.14123535, "routerloss_mlp": 0.0, "step": 1203, "time_per_iteration": 2.872408151626587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160017, "balance_loss_mlp": 1.1457237, "diversity_loss_mlp": 0.0, "epoch": 0.23162754905732974, "flos": 389242796544.0, "grad_norm": 0.08753213296005687, "language_loss": 0.82586002, "learning_rate": 0.0008971345530367832, "loss": 0.83746028, "num_input_tokens_seen": 100053888, "router_z_loss_mlp": 0.14282227, "routerloss_mlp": 0.0, "step": 1204, "time_per_iteration": 2.4641921520233154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185717, "balance_loss_mlp": 1.17120886, "diversity_loss_mlp": 0.0, "epoch": 0.2318199307425933, "flos": 667778535936.0, "grad_norm": 0.07947534631123947, "language_loss": 0.85658818, "learning_rate": 0.0008969451934066799, "loss": 0.8684454, "num_input_tokens_seen": 100124176, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1205, "time_per_iteration": 2.7822117805480957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173591, "balance_loss_mlp": 1.15872586, "diversity_loss_mlp": 0.0, "epoch": 0.23201231242785686, "flos": 666399757824.0, "grad_norm": 0.08780432716538046, "language_loss": 0.79991889, "learning_rate": 0.0008967556796687854, "loss": 0.81165481, "num_input_tokens_seen": 100205296, "router_z_loss_mlp": 0.14855957, "routerloss_mlp": 0.0, "step": 1206, "time_per_iteration": 2.8849406242370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117013, "balance_loss_mlp": 1.15584886, "diversity_loss_mlp": 0.0, "epoch": 0.23220469411312042, "flos": 748816565760.0, "grad_norm": 0.07569633120476413, "language_loss": 0.83779937, "learning_rate": 0.0008965660118966752, "loss": 0.84950066, "num_input_tokens_seen": 100279440, "router_z_loss_mlp": 0.14257812, "routerloss_mlp": 0.0, "step": 1207, "time_per_iteration": 2.9316329956054688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146692, "balance_loss_mlp": 1.1319102, "diversity_loss_mlp": 0.0, "epoch": 0.232397075798384, "flos": 667061982720.0, "grad_norm": 0.06968265941642382, "language_loss": 0.90114093, "learning_rate": 0.0008963761901639851, "loss": 0.91260791, "num_input_tokens_seen": 100354512, "router_z_loss_mlp": 0.14770508, "routerloss_mlp": 0.0, "step": 1208, "time_per_iteration": 2.8140323162078857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113879, "balance_loss_mlp": 1.12392485, "diversity_loss_mlp": 0.0, "epoch": 0.23258945748364757, "flos": 610218261504.0, "grad_norm": 0.08612535310277082, "language_loss": 0.83098078, "learning_rate": 0.0008961862145444103, "loss": 0.84236872, "num_input_tokens_seen": 100426848, "router_z_loss_mlp": 0.1484375, "routerloss_mlp": 0.0, "step": 1209, "time_per_iteration": 2.7529945373535156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122935, "balance_loss_mlp": 1.10796285, "diversity_loss_mlp": 0.0, "epoch": 0.23278183916891113, "flos": 489651982848.0, "grad_norm": 0.08243119711445285, "language_loss": 0.85338795, "learning_rate": 0.0008959960851117059, "loss": 0.86461735, "num_input_tokens_seen": 100496176, "router_z_loss_mlp": 0.14953613, "routerloss_mlp": 0.0, "step": 1210, "time_per_iteration": 2.624340534210205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108554, "balance_loss_mlp": 1.09396267, "diversity_loss_mlp": 0.0, "epoch": 0.23297422085417469, "flos": 511585403904.0, "grad_norm": 0.10596241027535934, "language_loss": 0.84048676, "learning_rate": 0.0008958058019396868, "loss": 0.85157233, "num_input_tokens_seen": 100575072, "router_z_loss_mlp": 0.14575195, "routerloss_mlp": 0.0, "step": 1211, "time_per_iteration": 2.8316566944122314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112068, "balance_loss_mlp": 1.09751284, "diversity_loss_mlp": 0.0, "epoch": 0.23316660253943824, "flos": 546421552128.0, "grad_norm": 0.07651667178885936, "language_loss": 0.86494702, "learning_rate": 0.0008956153651022274, "loss": 0.8760677, "num_input_tokens_seen": 100648304, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1212, "time_per_iteration": 2.684788465499878 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103634, "balance_loss_mlp": 1.08926892, "diversity_loss_mlp": 0.0, "epoch": 0.2333589842247018, "flos": 510256184832.0, "grad_norm": 0.07459915787800217, "language_loss": 0.83929688, "learning_rate": 0.0008954247746732618, "loss": 0.85033321, "num_input_tokens_seen": 100717616, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1213, "time_per_iteration": 2.6184399127960205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117524, "balance_loss_mlp": 1.10321903, "diversity_loss_mlp": 0.0, "epoch": 0.23355136590996536, "flos": 663148104192.0, "grad_norm": 0.08317009769115577, "language_loss": 0.90604293, "learning_rate": 0.0008952340307267837, "loss": 0.91721821, "num_input_tokens_seen": 100797056, "router_z_loss_mlp": 0.14306641, "routerloss_mlp": 0.0, "step": 1214, "time_per_iteration": 2.8993093967437744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119644, "balance_loss_mlp": 1.10553002, "diversity_loss_mlp": 0.0, "epoch": 0.23374374759522892, "flos": 508457461248.0, "grad_norm": 0.09601716623847659, "language_loss": 0.83731341, "learning_rate": 0.0008950431333368468, "loss": 0.84850979, "num_input_tokens_seen": 100863632, "router_z_loss_mlp": 0.14123535, "routerloss_mlp": 0.0, "step": 1215, "time_per_iteration": 2.6151199340820312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130858, "balance_loss_mlp": 1.11676729, "diversity_loss_mlp": 0.0, "epoch": 0.2339361292804925, "flos": 1294455499776.0, "grad_norm": 0.08049188450288745, "language_loss": 0.84623635, "learning_rate": 0.0008948520825775634, "loss": 0.8575449, "num_input_tokens_seen": 100950272, "router_z_loss_mlp": 0.14099121, "routerloss_mlp": 0.0, "step": 1216, "time_per_iteration": 3.645200490951538 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123063, "balance_loss_mlp": 1.10880601, "diversity_loss_mlp": 0.0, "epoch": 0.23412851096575607, "flos": 705928006656.0, "grad_norm": 0.08038238822992319, "language_loss": 0.83978343, "learning_rate": 0.0008946608785231067, "loss": 0.85101402, "num_input_tokens_seen": 101031008, "router_z_loss_mlp": 0.1427002, "routerloss_mlp": 0.0, "step": 1217, "time_per_iteration": 2.871616840362549 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126329, "balance_loss_mlp": 1.11263156, "diversity_loss_mlp": 0.0, "epoch": 0.23432089265101963, "flos": 438263903232.0, "grad_norm": 0.07832391647543825, "language_loss": 0.84442961, "learning_rate": 0.0008944695212477084, "loss": 0.85569292, "num_input_tokens_seen": 101094688, "router_z_loss_mlp": 0.13708496, "routerloss_mlp": 0.0, "step": 1218, "time_per_iteration": 2.507080078125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123493, "balance_loss_mlp": 1.10867572, "diversity_loss_mlp": 0.0, "epoch": 0.2345132743362832, "flos": 480939918336.0, "grad_norm": 0.07420792055611987, "language_loss": 0.86334574, "learning_rate": 0.0008942780108256599, "loss": 0.87458062, "num_input_tokens_seen": 101163744, "router_z_loss_mlp": 0.14794922, "routerloss_mlp": 0.0, "step": 1219, "time_per_iteration": 2.6183433532714844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107778, "balance_loss_mlp": 1.09330583, "diversity_loss_mlp": 0.0, "epoch": 0.23470565602154675, "flos": 411453001728.0, "grad_norm": 0.07657909053901747, "language_loss": 0.86160946, "learning_rate": 0.0008940863473313121, "loss": 0.87268722, "num_input_tokens_seen": 101226480, "router_z_loss_mlp": 0.14465332, "routerloss_mlp": 0.0, "step": 1220, "time_per_iteration": 2.495164632797241 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107377, "balance_loss_mlp": 1.09272623, "diversity_loss_mlp": 0.0, "epoch": 0.2348980377068103, "flos": 545450609664.0, "grad_norm": 0.07962638616920462, "language_loss": 0.87889743, "learning_rate": 0.0008938945308390756, "loss": 0.88997114, "num_input_tokens_seen": 101291824, "router_z_loss_mlp": 0.14648438, "routerloss_mlp": 0.0, "step": 1221, "time_per_iteration": 2.613927125930786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097552, "balance_loss_mlp": 1.08298469, "diversity_loss_mlp": 0.0, "epoch": 0.23509041939207387, "flos": 575740389888.0, "grad_norm": 0.06679649396710063, "language_loss": 0.87179595, "learning_rate": 0.00089370256142342, "loss": 0.88277149, "num_input_tokens_seen": 101367216, "router_z_loss_mlp": 0.14550781, "routerloss_mlp": 0.0, "step": 1222, "time_per_iteration": 2.732208013534546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094745, "balance_loss_mlp": 1.07952189, "diversity_loss_mlp": 0.0, "epoch": 0.23528280107733743, "flos": 588843177984.0, "grad_norm": 0.06680688140454344, "language_loss": 0.84810197, "learning_rate": 0.0008935104391588746, "loss": 0.85904944, "num_input_tokens_seen": 101438992, "router_z_loss_mlp": 0.15209961, "routerloss_mlp": 0.0, "step": 1223, "time_per_iteration": 2.7585461139678955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094902, "balance_loss_mlp": 1.07917881, "diversity_loss_mlp": 0.0, "epoch": 0.235475182762601, "flos": 823328893440.0, "grad_norm": 0.07271030004651308, "language_loss": 0.83111542, "learning_rate": 0.0008933181641200276, "loss": 0.84206444, "num_input_tokens_seen": 101534464, "router_z_loss_mlp": 0.15710449, "routerloss_mlp": 0.0, "step": 1224, "time_per_iteration": 3.1440725326538086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087445, "balance_loss_mlp": 1.07139981, "diversity_loss_mlp": 0.0, "epoch": 0.23566756444786457, "flos": 680164770816.0, "grad_norm": 0.07882513603721358, "language_loss": 0.85824931, "learning_rate": 0.0008931257363815271, "loss": 0.8691237, "num_input_tokens_seen": 101616496, "router_z_loss_mlp": 0.16040039, "routerloss_mlp": 0.0, "step": 1225, "time_per_iteration": 2.8887243270874023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092595, "balance_loss_mlp": 1.07659674, "diversity_loss_mlp": 0.0, "epoch": 0.23585994613312813, "flos": 701811495936.0, "grad_norm": 0.09571789824401095, "language_loss": 0.89901638, "learning_rate": 0.0008929331560180798, "loss": 0.90994227, "num_input_tokens_seen": 101694496, "router_z_loss_mlp": 0.15991211, "routerloss_mlp": 0.0, "step": 1226, "time_per_iteration": 2.897155284881592 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095366, "balance_loss_mlp": 1.07965469, "diversity_loss_mlp": 0.0, "epoch": 0.2360523278183917, "flos": 524176842240.0, "grad_norm": 0.068724406385502, "language_loss": 0.90771782, "learning_rate": 0.0008927404231044525, "loss": 0.91867149, "num_input_tokens_seen": 101766160, "router_z_loss_mlp": 0.15698242, "routerloss_mlp": 0.0, "step": 1227, "time_per_iteration": 2.6892144680023193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103179, "balance_loss_mlp": 1.08764625, "diversity_loss_mlp": 0.0, "epoch": 0.23624470950365525, "flos": 524310091776.0, "grad_norm": 0.06943954848997126, "language_loss": 0.81646705, "learning_rate": 0.0008925475377154703, "loss": 0.82749879, "num_input_tokens_seen": 101844160, "router_z_loss_mlp": 0.15515137, "routerloss_mlp": 0.0, "step": 1228, "time_per_iteration": 2.727325201034546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129278, "balance_loss_mlp": 1.11394727, "diversity_loss_mlp": 0.0, "epoch": 0.2364370911889188, "flos": 596811525120.0, "grad_norm": 0.0778889683705481, "language_loss": 0.8212285, "learning_rate": 0.0008923544999260183, "loss": 0.83252132, "num_input_tokens_seen": 101917968, "router_z_loss_mlp": 0.15307617, "routerloss_mlp": 0.0, "step": 1229, "time_per_iteration": 2.7520618438720703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146737, "balance_loss_mlp": 1.13194346, "diversity_loss_mlp": 0.0, "epoch": 0.23662947287418237, "flos": 756849153024.0, "grad_norm": 0.0853653064859127, "language_loss": 0.91254115, "learning_rate": 0.00089216130981104, "loss": 0.92400861, "num_input_tokens_seen": 101996880, "router_z_loss_mlp": 0.14794922, "routerloss_mlp": 0.0, "step": 1230, "time_per_iteration": 3.016228199005127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138894, "balance_loss_mlp": 1.12364721, "diversity_loss_mlp": 0.0, "epoch": 0.23682185455944593, "flos": 546167162880.0, "grad_norm": 0.08048994442870243, "language_loss": 0.82752085, "learning_rate": 0.000891967967445539, "loss": 0.83890975, "num_input_tokens_seen": 102067936, "router_z_loss_mlp": 0.15222168, "routerloss_mlp": 0.0, "step": 1231, "time_per_iteration": 2.65736722946167 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126061, "balance_loss_mlp": 1.11135054, "diversity_loss_mlp": 0.0, "epoch": 0.2370142362447095, "flos": 662285818368.0, "grad_norm": 0.05909715635047166, "language_loss": 0.889099, "learning_rate": 0.0008917744729045772, "loss": 0.90035963, "num_input_tokens_seen": 102147552, "router_z_loss_mlp": 0.14685059, "routerloss_mlp": 0.0, "step": 1232, "time_per_iteration": 2.8686273097991943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110871, "balance_loss_mlp": 1.0962795, "diversity_loss_mlp": 0.0, "epoch": 0.23720661792997308, "flos": 683670813696.0, "grad_norm": 0.08046733758331526, "language_loss": 0.83836448, "learning_rate": 0.0008915808262632757, "loss": 0.84947324, "num_input_tokens_seen": 102224480, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1233, "time_per_iteration": 2.860353708267212 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00918962, "balance_loss_mlp": 1.60287488, "diversity_loss_mlp": 0.20008399, "epoch": 0.23739899961523664, "flos": 558909103104.0, "grad_norm": 0.03182006079144566, "language_loss": 0.93544835, "learning_rate": 0.0008913870275968148, "loss": 0.94463801, "num_input_tokens_seen": 102297392, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.017482, "step": 1234, "time_per_iteration": 2.7328829765319824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095751, "balance_loss_mlp": 1.08008718, "diversity_loss_mlp": 0.0, "epoch": 0.2375913813005002, "flos": 889546904064.0, "grad_norm": 0.07195832826776788, "language_loss": 0.87503707, "learning_rate": 0.0008911930769804342, "loss": 0.88599461, "num_input_tokens_seen": 102386032, "router_z_loss_mlp": 0.15649414, "routerloss_mlp": 0.0, "step": 1235, "time_per_iteration": 3.2619638442993164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091405, "balance_loss_mlp": 1.07551408, "diversity_loss_mlp": 0.0, "epoch": 0.23778376298576376, "flos": 641120707584.0, "grad_norm": 0.07148547933088874, "language_loss": 0.91313815, "learning_rate": 0.0008909989744894318, "loss": 0.92405218, "num_input_tokens_seen": 102463504, "router_z_loss_mlp": 0.15881348, "routerloss_mlp": 0.0, "step": 1236, "time_per_iteration": 2.8687992095947266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080974, "balance_loss_mlp": 1.06530952, "diversity_loss_mlp": 0.0, "epoch": 0.23797614467102732, "flos": 616820313600.0, "grad_norm": 0.08021447901266163, "language_loss": 0.81662518, "learning_rate": 0.0008908047201991649, "loss": 0.8274349, "num_input_tokens_seen": 102529632, "router_z_loss_mlp": 0.15649414, "routerloss_mlp": 0.0, "step": 1237, "time_per_iteration": 2.737638235092163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076608, "balance_loss_mlp": 1.06138515, "diversity_loss_mlp": 0.0, "epoch": 0.23816852635629088, "flos": 624245004288.0, "grad_norm": 0.07749899394714953, "language_loss": 0.86585152, "learning_rate": 0.0008906103141850502, "loss": 0.87661767, "num_input_tokens_seen": 102610192, "router_z_loss_mlp": 0.15197754, "routerloss_mlp": 0.0, "step": 1238, "time_per_iteration": 2.9184746742248535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068848, "balance_loss_mlp": 1.05385113, "diversity_loss_mlp": 0.0, "epoch": 0.23836090804155444, "flos": 521431769088.0, "grad_norm": 0.10230617436374452, "language_loss": 0.88104367, "learning_rate": 0.0008904157565225621, "loss": 0.89173216, "num_input_tokens_seen": 102681216, "router_z_loss_mlp": 0.1496582, "routerloss_mlp": 0.0, "step": 1239, "time_per_iteration": 2.6396749019622803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077038, "balance_loss_mlp": 1.06220865, "diversity_loss_mlp": 0.0, "epoch": 0.238553289726818, "flos": 1153991660544.0, "grad_norm": 0.10467557893696883, "language_loss": 0.81824136, "learning_rate": 0.000890221047287235, "loss": 0.82901168, "num_input_tokens_seen": 102777184, "router_z_loss_mlp": 0.14807129, "routerloss_mlp": 0.0, "step": 1240, "time_per_iteration": 3.496812582015991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081371, "balance_loss_mlp": 1.06710172, "diversity_loss_mlp": 0.0, "epoch": 0.23874567141208156, "flos": 499861393920.0, "grad_norm": 0.09443583580909311, "language_loss": 0.91125917, "learning_rate": 0.0008900261865546615, "loss": 0.92207289, "num_input_tokens_seen": 102845744, "router_z_loss_mlp": 0.1427002, "routerloss_mlp": 0.0, "step": 1241, "time_per_iteration": 2.6527724266052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103977, "balance_loss_mlp": 1.0890398, "diversity_loss_mlp": 0.0, "epoch": 0.23893805309734514, "flos": 556934911488.0, "grad_norm": 0.08429957072104315, "language_loss": 0.84985352, "learning_rate": 0.0008898311744004936, "loss": 0.86089325, "num_input_tokens_seen": 102918064, "router_z_loss_mlp": 0.14916992, "routerloss_mlp": 0.0, "step": 1242, "time_per_iteration": 2.6740338802337646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118763, "balance_loss_mlp": 1.10411179, "diversity_loss_mlp": 0.0, "epoch": 0.2391304347826087, "flos": 549270512640.0, "grad_norm": 0.07332762129893158, "language_loss": 0.86932802, "learning_rate": 0.0008896360109004414, "loss": 0.88051569, "num_input_tokens_seen": 102983920, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1243, "time_per_iteration": 2.643489122390747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142112, "balance_loss_mlp": 1.12715125, "diversity_loss_mlp": 0.0, "epoch": 0.23932281646787226, "flos": 516050279424.0, "grad_norm": 0.09306092844590973, "language_loss": 0.84636557, "learning_rate": 0.0008894406961302742, "loss": 0.85778666, "num_input_tokens_seen": 103053328, "router_z_loss_mlp": 0.14941406, "routerloss_mlp": 0.0, "step": 1244, "time_per_iteration": 2.5876173973083496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150798, "balance_loss_mlp": 1.13590896, "diversity_loss_mlp": 0.0, "epoch": 0.23951519815313582, "flos": 743682124800.0, "grad_norm": 0.0838589606869783, "language_loss": 0.83944738, "learning_rate": 0.0008892452301658201, "loss": 0.85095537, "num_input_tokens_seen": 103128208, "router_z_loss_mlp": 0.14868164, "routerloss_mlp": 0.0, "step": 1245, "time_per_iteration": 2.928391218185425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116146, "balance_loss_mlp": 1.1460346, "diversity_loss_mlp": 0.0, "epoch": 0.23970757983839938, "flos": 554118257664.0, "grad_norm": 0.0736247551351698, "language_loss": 0.83299339, "learning_rate": 0.0008890496130829653, "loss": 0.84460801, "num_input_tokens_seen": 103197392, "router_z_loss_mlp": 0.1541748, "routerloss_mlp": 0.0, "step": 1246, "time_per_iteration": 2.6510462760925293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00915571, "balance_loss_mlp": 1.59993446, "diversity_loss_mlp": 0.1987851, "epoch": 0.23989996152366294, "flos": 480655793664.0, "grad_norm": 0.03287481157446996, "language_loss": 0.85918486, "learning_rate": 0.0008888538449576555, "loss": 0.86834061, "num_input_tokens_seen": 103265328, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01621127, "step": 1247, "time_per_iteration": 2.5719456672668457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01178279, "balance_loss_mlp": 1.16323447, "diversity_loss_mlp": 0.0, "epoch": 0.2400923432089265, "flos": 485310818304.0, "grad_norm": 0.10811715250715398, "language_loss": 0.83036304, "learning_rate": 0.0008886579258658944, "loss": 0.8421458, "num_input_tokens_seen": 103331632, "router_z_loss_mlp": 0.15014648, "routerloss_mlp": 0.0, "step": 1248, "time_per_iteration": 2.5736701488494873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148631, "balance_loss_mlp": 1.13341999, "diversity_loss_mlp": 0.0, "epoch": 0.24028472489419006, "flos": 623555615232.0, "grad_norm": 0.07868761607649298, "language_loss": 0.84717274, "learning_rate": 0.0008884618558837446, "loss": 0.85865903, "num_input_tokens_seen": 103405408, "router_z_loss_mlp": 0.15185547, "routerloss_mlp": 0.0, "step": 1249, "time_per_iteration": 2.8215761184692383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00911764, "balance_loss_mlp": 1.59372783, "diversity_loss_mlp": 0.19720009, "epoch": 0.24047710657945365, "flos": 601602370560.0, "grad_norm": 0.03236174678929329, "language_loss": 0.8677094, "learning_rate": 0.0008882656350873273, "loss": 0.87682706, "num_input_tokens_seen": 103487216, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01629994, "step": 1250, "time_per_iteration": 2.885092258453369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126022, "balance_loss_mlp": 1.11122799, "diversity_loss_mlp": 0.0, "epoch": 0.2406694882647172, "flos": 841558781952.0, "grad_norm": 0.08347743908005935, "language_loss": 0.87000573, "learning_rate": 0.0008880692635528219, "loss": 0.88126594, "num_input_tokens_seen": 103568640, "router_z_loss_mlp": 0.14782715, "routerloss_mlp": 0.0, "step": 1251, "time_per_iteration": 3.049070119857788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106629, "balance_loss_mlp": 1.09177542, "diversity_loss_mlp": 0.0, "epoch": 0.24086186994998077, "flos": 527057736192.0, "grad_norm": 0.07406446185181008, "language_loss": 0.89514965, "learning_rate": 0.0008878727413564669, "loss": 0.90621597, "num_input_tokens_seen": 103640784, "router_z_loss_mlp": 0.14831543, "routerloss_mlp": 0.0, "step": 1252, "time_per_iteration": 2.734839677810669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075719, "balance_loss_mlp": 1.06804204, "diversity_loss_mlp": 0.0, "epoch": 0.24105425163524433, "flos": 1338261378048.0, "grad_norm": 0.048930323133030355, "language_loss": 0.80135596, "learning_rate": 0.0008876760685745588, "loss": 0.81211317, "num_input_tokens_seen": 103865824, "router_z_loss_mlp": 0.07666016, "routerloss_mlp": 0.0, "step": 1253, "time_per_iteration": 4.854974031448364 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00873083, "balance_loss_mlp": 1.51531768, "diversity_loss_mlp": 0.19563958, "epoch": 0.24124663332050789, "flos": 614102404608.0, "grad_norm": 0.03648198852202315, "language_loss": 0.78763413, "learning_rate": 0.0008874792452834528, "loss": 0.7963649, "num_input_tokens_seen": 103939872, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01760404, "step": 1254, "time_per_iteration": 2.803690195083618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090026, "balance_loss_mlp": 1.07530415, "diversity_loss_mlp": 0.0, "epoch": 0.24143901500577145, "flos": 575540328960.0, "grad_norm": 0.09659900556863026, "language_loss": 0.8729195, "learning_rate": 0.0008872822715595626, "loss": 0.88381982, "num_input_tokens_seen": 104011120, "router_z_loss_mlp": 0.14697266, "routerloss_mlp": 0.0, "step": 1255, "time_per_iteration": 2.657867670059204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084204, "balance_loss_mlp": 1.06968451, "diversity_loss_mlp": 0.0, "epoch": 0.241631396691035, "flos": 495181776384.0, "grad_norm": 0.10497791491954662, "language_loss": 0.87333822, "learning_rate": 0.0008870851474793598, "loss": 0.88418031, "num_input_tokens_seen": 104077040, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1256, "time_per_iteration": 2.5694568157196045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083756, "balance_loss_mlp": 1.06920075, "diversity_loss_mlp": 0.0, "epoch": 0.24182377837629856, "flos": 636191470080.0, "grad_norm": 0.07331256259210016, "language_loss": 0.89243567, "learning_rate": 0.0008868878731193752, "loss": 0.90327322, "num_input_tokens_seen": 104150880, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1257, "time_per_iteration": 2.829789400100708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086138, "balance_loss_mlp": 1.07158267, "diversity_loss_mlp": 0.0, "epoch": 0.24201616006156215, "flos": 515219927040.0, "grad_norm": 0.07236027639177293, "language_loss": 0.89720446, "learning_rate": 0.0008866904485561973, "loss": 0.90806586, "num_input_tokens_seen": 104223696, "router_z_loss_mlp": 0.14526367, "routerloss_mlp": 0.0, "step": 1258, "time_per_iteration": 2.731635570526123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078524, "balance_loss_mlp": 1.06384969, "diversity_loss_mlp": 0.0, "epoch": 0.2422085417468257, "flos": 615144927744.0, "grad_norm": 0.0727569881861308, "language_loss": 0.83084273, "learning_rate": 0.000886492873866473, "loss": 0.84162796, "num_input_tokens_seen": 104301728, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1259, "time_per_iteration": 2.8250575065612793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080175, "balance_loss_mlp": 1.06528533, "diversity_loss_mlp": 0.0, "epoch": 0.24240092343208927, "flos": 585794156544.0, "grad_norm": 0.10762424055834904, "language_loss": 0.84672934, "learning_rate": 0.000886295149126908, "loss": 0.85753107, "num_input_tokens_seen": 104374480, "router_z_loss_mlp": 0.14868164, "routerloss_mlp": 0.0, "step": 1260, "time_per_iteration": 2.7148356437683105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086434, "balance_loss_mlp": 1.07181931, "diversity_loss_mlp": 0.0, "epoch": 0.24259330511735283, "flos": 762257806848.0, "grad_norm": 0.07159531524201106, "language_loss": 0.85693741, "learning_rate": 0.0008860972744142655, "loss": 0.86780179, "num_input_tokens_seen": 104452384, "router_z_loss_mlp": 0.14599609, "routerloss_mlp": 0.0, "step": 1261, "time_per_iteration": 2.931696653366089 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115009, "balance_loss_mlp": 1.10064411, "diversity_loss_mlp": 0.0, "epoch": 0.2427856868026164, "flos": 626878849536.0, "grad_norm": 0.065367920687613, "language_loss": 0.81639904, "learning_rate": 0.0008858992498053671, "loss": 0.82754916, "num_input_tokens_seen": 104532576, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1262, "time_per_iteration": 2.846466541290283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055704, "balance_loss_mlp": 1.04764521, "diversity_loss_mlp": 0.0, "epoch": 0.24297806848787995, "flos": 1511653985280.0, "grad_norm": 0.03374572714932058, "language_loss": 0.7658875, "learning_rate": 0.0008857010753770934, "loss": 0.77644455, "num_input_tokens_seen": 104765216, "router_z_loss_mlp": 0.08056641, "routerloss_mlp": 0.0, "step": 1263, "time_per_iteration": 4.882519006729126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00872344, "balance_loss_mlp": 1.51226497, "diversity_loss_mlp": 0.19974959, "epoch": 0.2431704501731435, "flos": 541949336064.0, "grad_norm": 0.03166105856965055, "language_loss": 0.83409035, "learning_rate": 0.0008855027512063817, "loss": 0.84281385, "num_input_tokens_seen": 104836912, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01633644, "step": 1264, "time_per_iteration": 2.7414488792419434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01185798, "balance_loss_mlp": 1.17132628, "diversity_loss_mlp": 0.0, "epoch": 0.24336283185840707, "flos": 523845729792.0, "grad_norm": 0.06261248257395001, "language_loss": 0.85949916, "learning_rate": 0.0008853042773702292, "loss": 0.8713572, "num_input_tokens_seen": 104909280, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1265, "time_per_iteration": 2.695514440536499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196886, "balance_loss_mlp": 1.18234205, "diversity_loss_mlp": 0.0, "epoch": 0.24355521354367063, "flos": 537111502848.0, "grad_norm": 0.08760826562773598, "language_loss": 0.87981403, "learning_rate": 0.0008851056539456896, "loss": 0.89178288, "num_input_tokens_seen": 104982560, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1266, "time_per_iteration": 2.6937575340270996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0119913, "balance_loss_mlp": 1.18489647, "diversity_loss_mlp": 0.0, "epoch": 0.24374759522893422, "flos": 930461271552.0, "grad_norm": 0.07991839198753149, "language_loss": 0.81904382, "learning_rate": 0.0008849068810098755, "loss": 0.83103514, "num_input_tokens_seen": 105075056, "router_z_loss_mlp": 0.14221191, "routerloss_mlp": 0.0, "step": 1267, "time_per_iteration": 3.3067915439605713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174372, "balance_loss_mlp": 1.15992332, "diversity_loss_mlp": 0.0, "epoch": 0.24393997691419778, "flos": 427787619840.0, "grad_norm": 0.10499473220259715, "language_loss": 0.83550054, "learning_rate": 0.0008847079586399575, "loss": 0.84724426, "num_input_tokens_seen": 105137536, "router_z_loss_mlp": 0.14440918, "routerloss_mlp": 0.0, "step": 1268, "time_per_iteration": 2.4791157245635986 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115106, "balance_loss_mlp": 1.13699341, "diversity_loss_mlp": 0.0, "epoch": 0.24413235859946134, "flos": 578853651456.0, "grad_norm": 0.07765469411987547, "language_loss": 0.86144567, "learning_rate": 0.0008845088869131641, "loss": 0.87295628, "num_input_tokens_seen": 105204848, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1269, "time_per_iteration": 2.6733555793762207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111483, "balance_loss_mlp": 1.10053682, "diversity_loss_mlp": 0.0, "epoch": 0.2443247402847249, "flos": 529859708928.0, "grad_norm": 0.0888033537849515, "language_loss": 0.88898385, "learning_rate": 0.0008843096659067818, "loss": 0.90013218, "num_input_tokens_seen": 105273456, "router_z_loss_mlp": 0.14294434, "routerloss_mlp": 0.0, "step": 1270, "time_per_iteration": 2.6315910816192627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111213, "balance_loss_mlp": 1.09708679, "diversity_loss_mlp": 0.0, "epoch": 0.24451712196998845, "flos": 696321349632.0, "grad_norm": 0.09475560383246978, "language_loss": 0.86565858, "learning_rate": 0.000884110295698155, "loss": 0.87677073, "num_input_tokens_seen": 105355488, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1271, "time_per_iteration": 2.926668643951416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110218, "balance_loss_mlp": 1.08752966, "diversity_loss_mlp": 0.0, "epoch": 0.24470950365525201, "flos": 529832544768.0, "grad_norm": 0.09917556522455147, "language_loss": 0.85849231, "learning_rate": 0.0008839107763646861, "loss": 0.86951411, "num_input_tokens_seen": 105421568, "router_z_loss_mlp": 0.14624023, "routerloss_mlp": 0.0, "step": 1272, "time_per_iteration": 2.58022403717041 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110242, "balance_loss_mlp": 1.08751881, "diversity_loss_mlp": 0.0, "epoch": 0.24490188534051557, "flos": 491342049792.0, "grad_norm": 0.08783320449451974, "language_loss": 0.89941388, "learning_rate": 0.0008837111079838353, "loss": 0.91043806, "num_input_tokens_seen": 105493072, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1273, "time_per_iteration": 2.6877150535583496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111566, "balance_loss_mlp": 1.10096157, "diversity_loss_mlp": 0.0, "epoch": 0.24509426702577913, "flos": 474155057664.0, "grad_norm": 0.07640958054403056, "language_loss": 0.89671296, "learning_rate": 0.000883511290633121, "loss": 0.90786958, "num_input_tokens_seen": 105559840, "router_z_loss_mlp": 0.14672852, "routerloss_mlp": 0.0, "step": 1274, "time_per_iteration": 2.5929813385009766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123812, "balance_loss_mlp": 1.10898256, "diversity_loss_mlp": 0.0, "epoch": 0.24528664871104272, "flos": 550592391168.0, "grad_norm": 0.05814589763763208, "language_loss": 0.92211604, "learning_rate": 0.000883311324390119, "loss": 0.93335414, "num_input_tokens_seen": 105634448, "router_z_loss_mlp": 0.14807129, "routerloss_mlp": 0.0, "step": 1275, "time_per_iteration": 2.721343517303467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138447, "balance_loss_mlp": 1.12315261, "diversity_loss_mlp": 0.0, "epoch": 0.24547903039630628, "flos": 825903641088.0, "grad_norm": 0.10098653640048322, "language_loss": 0.81237984, "learning_rate": 0.0008831112093324629, "loss": 0.82376432, "num_input_tokens_seen": 105711936, "router_z_loss_mlp": 0.15283203, "routerloss_mlp": 0.0, "step": 1276, "time_per_iteration": 3.066657543182373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148152, "balance_loss_mlp": 1.13266695, "diversity_loss_mlp": 0.0, "epoch": 0.24567141208156984, "flos": 591598162944.0, "grad_norm": 0.07328274291062464, "language_loss": 0.89255905, "learning_rate": 0.0008829109455378444, "loss": 0.90404058, "num_input_tokens_seen": 105780240, "router_z_loss_mlp": 0.15466309, "routerloss_mlp": 0.0, "step": 1277, "time_per_iteration": 2.6705071926116943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163573, "balance_loss_mlp": 1.14844561, "diversity_loss_mlp": 0.0, "epoch": 0.2458637937668334, "flos": 547874482176.0, "grad_norm": 0.08343231090098181, "language_loss": 0.86569774, "learning_rate": 0.000882710533084013, "loss": 0.87733346, "num_input_tokens_seen": 105849840, "router_z_loss_mlp": 0.15100098, "routerloss_mlp": 0.0, "step": 1278, "time_per_iteration": 2.632864236831665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152351, "balance_loss_mlp": 1.13783133, "diversity_loss_mlp": 0.0, "epoch": 0.24605617545209696, "flos": 515894635008.0, "grad_norm": 0.0729065811951457, "language_loss": 0.8929435, "learning_rate": 0.0008825099720487755, "loss": 0.90446699, "num_input_tokens_seen": 105921488, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1279, "time_per_iteration": 2.7111196517944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00676302, "balance_loss_mlp": 1.12665224, "diversity_loss_mlp": 0.19835761, "epoch": 0.24624855713736052, "flos": 1511772553728.0, "grad_norm": 0.0027483074809680533, "language_loss": 0.7526114, "learning_rate": 0.0008823092625099967, "loss": 0.75937444, "num_input_tokens_seen": 106146816, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0137972, "step": 1280, "time_per_iteration": 4.88429594039917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111003, "balance_loss_mlp": 1.10232449, "diversity_loss_mlp": 0.0, "epoch": 0.24644093882262408, "flos": 1527608305152.0, "grad_norm": 0.05615046205501133, "language_loss": 0.77944112, "learning_rate": 0.0008821084045455987, "loss": 0.79055113, "num_input_tokens_seen": 106361568, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 1281, "time_per_iteration": 4.752316236495972 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113823, "balance_loss_mlp": 1.09987593, "diversity_loss_mlp": 0.0, "epoch": 0.24663332050788764, "flos": 659118228480.0, "grad_norm": 0.08093958913819582, "language_loss": 0.89542687, "learning_rate": 0.0008819073982335619, "loss": 0.90656507, "num_input_tokens_seen": 106435296, "router_z_loss_mlp": 0.13964844, "routerloss_mlp": 0.0, "step": 1282, "time_per_iteration": 2.876927137374878 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110167, "balance_loss_mlp": 1.08783603, "diversity_loss_mlp": 0.0, "epoch": 0.24682570219315123, "flos": 541769098752.0, "grad_norm": 0.07169123109412263, "language_loss": 0.84362143, "learning_rate": 0.0008817062436519235, "loss": 0.8546381, "num_input_tokens_seen": 106507184, "router_z_loss_mlp": 0.13824463, "routerloss_mlp": 0.0, "step": 1283, "time_per_iteration": 2.6551387310028076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0086846, "balance_loss_mlp": 1.5022366, "diversity_loss_mlp": 0.20048198, "epoch": 0.24701808387841478, "flos": 440695116288.0, "grad_norm": 0.033180516132009126, "language_loss": 0.89655471, "learning_rate": 0.0008815049408787788, "loss": 0.90523928, "num_input_tokens_seen": 106571472, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01710081, "step": 1284, "time_per_iteration": 2.5652830600738525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100363, "balance_loss_mlp": 1.08698821, "diversity_loss_mlp": 0.0, "epoch": 0.24721046556367834, "flos": 468066926592.0, "grad_norm": 0.0762028673981185, "language_loss": 0.85473216, "learning_rate": 0.0008813034899922805, "loss": 0.86573577, "num_input_tokens_seen": 106638368, "router_z_loss_mlp": 0.1340332, "routerloss_mlp": 0.0, "step": 1285, "time_per_iteration": 2.549622058868408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111306, "balance_loss_mlp": 1.09783578, "diversity_loss_mlp": 0.0, "epoch": 0.2474028472489419, "flos": 504427585536.0, "grad_norm": 0.11471388318643767, "language_loss": 0.89855313, "learning_rate": 0.0008811018910706387, "loss": 0.9096663, "num_input_tokens_seen": 106705312, "router_z_loss_mlp": 0.13500977, "routerloss_mlp": 0.0, "step": 1286, "time_per_iteration": 2.575176954269409 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117993, "balance_loss_mlp": 1.10453439, "diversity_loss_mlp": 0.0, "epoch": 0.24759522893420546, "flos": 479956492800.0, "grad_norm": 0.10517914532856759, "language_loss": 0.81922066, "learning_rate": 0.0008809001441921211, "loss": 0.83040059, "num_input_tokens_seen": 106778624, "router_z_loss_mlp": 0.13476562, "routerloss_mlp": 0.0, "step": 1287, "time_per_iteration": 2.732236862182617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126757, "balance_loss_mlp": 1.1132865, "diversity_loss_mlp": 0.0, "epoch": 0.24778761061946902, "flos": 533706776064.0, "grad_norm": 0.1440229573277689, "language_loss": 0.85392761, "learning_rate": 0.0008806982494350528, "loss": 0.86519527, "num_input_tokens_seen": 106847744, "router_z_loss_mlp": 0.13476562, "routerloss_mlp": 0.0, "step": 1288, "time_per_iteration": 2.6544177532196045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168468, "balance_loss_mlp": 1.1549263, "diversity_loss_mlp": 0.0, "epoch": 0.24797999230473258, "flos": 559798553088.0, "grad_norm": 0.07192560701016996, "language_loss": 0.9021467, "learning_rate": 0.0008804962068778161, "loss": 0.91383135, "num_input_tokens_seen": 106927584, "router_z_loss_mlp": 0.13562012, "routerloss_mlp": 0.0, "step": 1289, "time_per_iteration": 2.8321304321289062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01217004, "balance_loss_mlp": 1.20329499, "diversity_loss_mlp": 0.0, "epoch": 0.24817237398999614, "flos": 624225180672.0, "grad_norm": 0.08274381184261048, "language_loss": 0.81234664, "learning_rate": 0.0008802940165988511, "loss": 0.82451665, "num_input_tokens_seen": 107006656, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1290, "time_per_iteration": 2.848726749420166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01262968, "balance_loss_mlp": 1.24875808, "diversity_loss_mlp": 0.0, "epoch": 0.2483647556752597, "flos": 612281286144.0, "grad_norm": 0.09449787402071168, "language_loss": 0.88461435, "learning_rate": 0.000880091678676655, "loss": 0.8972441, "num_input_tokens_seen": 107084352, "router_z_loss_mlp": 0.14221191, "routerloss_mlp": 0.0, "step": 1291, "time_per_iteration": 2.802199363708496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01279654, "balance_loss_mlp": 1.26553965, "diversity_loss_mlp": 0.0, "epoch": 0.2485571373605233, "flos": 583553092608.0, "grad_norm": 0.11843407890200246, "language_loss": 0.88870949, "learning_rate": 0.0008798891931897821, "loss": 0.90150601, "num_input_tokens_seen": 107158368, "router_z_loss_mlp": 0.14123535, "routerloss_mlp": 0.0, "step": 1292, "time_per_iteration": 2.7150259017944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00870403, "balance_loss_mlp": 1.50883341, "diversity_loss_mlp": 0.20002533, "epoch": 0.24874951904578685, "flos": 494749347840.0, "grad_norm": 0.035309457370921726, "language_loss": 0.84031773, "learning_rate": 0.0008796865602168447, "loss": 0.84902173, "num_input_tokens_seen": 107224256, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01597392, "step": 1293, "time_per_iteration": 2.5952000617980957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01210957, "balance_loss_mlp": 1.19661582, "diversity_loss_mlp": 0.0, "epoch": 0.2489419007310504, "flos": 456174789120.0, "grad_norm": 0.07909897749306223, "language_loss": 0.88611919, "learning_rate": 0.0008794837798365115, "loss": 0.89822876, "num_input_tokens_seen": 107292720, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1294, "time_per_iteration": 2.6257524490356445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167929, "balance_loss_mlp": 1.15246725, "diversity_loss_mlp": 0.0, "epoch": 0.24913428241631397, "flos": 485471232000.0, "grad_norm": 0.06704316740686254, "language_loss": 0.8866623, "learning_rate": 0.0008792808521275089, "loss": 0.89834166, "num_input_tokens_seen": 107368576, "router_z_loss_mlp": 0.15441895, "routerloss_mlp": 0.0, "step": 1295, "time_per_iteration": 2.7125115394592285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153488, "balance_loss_mlp": 1.13757372, "diversity_loss_mlp": 0.0, "epoch": 0.24932666410157753, "flos": 518906580480.0, "grad_norm": 0.08601952378824393, "language_loss": 0.87496305, "learning_rate": 0.0008790777771686206, "loss": 0.88649786, "num_input_tokens_seen": 107433856, "router_z_loss_mlp": 0.15905762, "routerloss_mlp": 0.0, "step": 1296, "time_per_iteration": 2.6131319999694824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124706, "balance_loss_mlp": 1.10882747, "diversity_loss_mlp": 0.0, "epoch": 0.2495190457868411, "flos": 472603382784.0, "grad_norm": 0.0951042007575699, "language_loss": 0.8543523, "learning_rate": 0.0008788745550386872, "loss": 0.86559939, "num_input_tokens_seen": 107500944, "router_z_loss_mlp": 0.15869141, "routerloss_mlp": 0.0, "step": 1297, "time_per_iteration": 2.5590503215789795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115503, "balance_loss_mlp": 1.09948111, "diversity_loss_mlp": 0.0, "epoch": 0.24971142747210465, "flos": 745886112768.0, "grad_norm": 0.07219065567928346, "language_loss": 0.80291975, "learning_rate": 0.0008786711858166063, "loss": 0.81407487, "num_input_tokens_seen": 107580000, "router_z_loss_mlp": 0.16015625, "routerloss_mlp": 0.0, "step": 1298, "time_per_iteration": 2.951768398284912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00871436, "balance_loss_mlp": 1.51113367, "diversity_loss_mlp": 0.19870289, "epoch": 0.2499038091573682, "flos": 749557711872.0, "grad_norm": 0.03357842357877673, "language_loss": 0.83488023, "learning_rate": 0.0008784676695813332, "loss": 0.84359455, "num_input_tokens_seen": 107660384, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0165179, "step": 1299, "time_per_iteration": 2.985684871673584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108985, "balance_loss_mlp": 1.07411456, "diversity_loss_mlp": 0.0, "epoch": 0.2500961908426318, "flos": 745060902912.0, "grad_norm": 0.07050099983107566, "language_loss": 0.84900999, "learning_rate": 0.0008782640064118796, "loss": 0.85990846, "num_input_tokens_seen": 107736320, "router_z_loss_mlp": 0.15722656, "routerloss_mlp": 0.0, "step": 1300, "time_per_iteration": 2.943368673324585 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139161, "balance_loss_mlp": 1.13172245, "diversity_loss_mlp": 0.0, "epoch": 0.2502885725278953, "flos": 1417424334336.0, "grad_norm": 0.062054541004710057, "language_loss": 0.7618475, "learning_rate": 0.0008780601963873149, "loss": 0.77323914, "num_input_tokens_seen": 107972608, "router_z_loss_mlp": 0.07421875, "routerloss_mlp": 0.0, "step": 1301, "time_per_iteration": 4.975619316101074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106013, "balance_loss_mlp": 1.09055138, "diversity_loss_mlp": 0.0, "epoch": 0.2504809542131589, "flos": 515215157760.0, "grad_norm": 0.08145949094764637, "language_loss": 0.86554521, "learning_rate": 0.0008778562395867648, "loss": 0.87660533, "num_input_tokens_seen": 108043312, "router_z_loss_mlp": 0.15441895, "routerloss_mlp": 0.0, "step": 1302, "time_per_iteration": 2.6318612098693848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111342, "balance_loss_mlp": 1.09572554, "diversity_loss_mlp": 0.0, "epoch": 0.25067333589842244, "flos": 525819921408.0, "grad_norm": 0.0727542370097133, "language_loss": 0.84224409, "learning_rate": 0.0008776521360894127, "loss": 0.85335743, "num_input_tokens_seen": 108114144, "router_z_loss_mlp": 0.15600586, "routerloss_mlp": 0.0, "step": 1303, "time_per_iteration": 2.6512627601623535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01029747, "balance_loss_mlp": 1.02259421, "diversity_loss_mlp": 0.0, "epoch": 0.25086571758368603, "flos": 1473897295872.0, "grad_norm": 0.02979233866947858, "language_loss": 0.78962064, "learning_rate": 0.0008774478859744984, "loss": 0.79991817, "num_input_tokens_seen": 108338720, "router_z_loss_mlp": 0.07128906, "routerloss_mlp": 0.0, "step": 1304, "time_per_iteration": 4.802467107772827 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112993, "balance_loss_mlp": 1.11518431, "diversity_loss_mlp": 0.0, "epoch": 0.2510580992689496, "flos": 528382185984.0, "grad_norm": 0.07060498048015267, "language_loss": 0.9057076, "learning_rate": 0.0008772434893213186, "loss": 0.91700697, "num_input_tokens_seen": 108405456, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 1305, "time_per_iteration": 2.601546049118042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137218, "balance_loss_mlp": 1.12251997, "diversity_loss_mlp": 0.0, "epoch": 0.25125048095421315, "flos": 517446309888.0, "grad_norm": 0.13797279723809866, "language_loss": 0.84362888, "learning_rate": 0.0008770389462092276, "loss": 0.85500103, "num_input_tokens_seen": 108474368, "router_z_loss_mlp": 0.14685059, "routerloss_mlp": 0.0, "step": 1306, "time_per_iteration": 2.626138210296631 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141522, "balance_loss_mlp": 1.12685966, "diversity_loss_mlp": 0.0, "epoch": 0.25144286263947674, "flos": 620462177280.0, "grad_norm": 0.08471108342240245, "language_loss": 0.86803389, "learning_rate": 0.0008768342567176357, "loss": 0.87944913, "num_input_tokens_seen": 108548864, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1307, "time_per_iteration": 2.8074796199798584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114159, "balance_loss_mlp": 1.12681937, "diversity_loss_mlp": 0.0, "epoch": 0.25163524432474027, "flos": 503799865344.0, "grad_norm": 0.07263390393133992, "language_loss": 0.90559924, "learning_rate": 0.0008766294209260107, "loss": 0.91701508, "num_input_tokens_seen": 108623072, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 1308, "time_per_iteration": 2.670790910720825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147465, "balance_loss_mlp": 1.13312435, "diversity_loss_mlp": 0.0, "epoch": 0.25182762601000386, "flos": 509072698368.0, "grad_norm": 0.07764888634730133, "language_loss": 0.91554916, "learning_rate": 0.0008764244389138767, "loss": 0.92702377, "num_input_tokens_seen": 108690128, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1309, "time_per_iteration": 2.572793483734131 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147476, "balance_loss_mlp": 1.13318276, "diversity_loss_mlp": 0.0, "epoch": 0.2520200076952674, "flos": 633896077824.0, "grad_norm": 0.09714227143719616, "language_loss": 0.82980847, "learning_rate": 0.000876219310760815, "loss": 0.8412832, "num_input_tokens_seen": 108770272, "router_z_loss_mlp": 0.14306641, "routerloss_mlp": 0.0, "step": 1310, "time_per_iteration": 2.8601791858673096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146968, "balance_loss_mlp": 1.13273418, "diversity_loss_mlp": 0.0, "epoch": 0.252212389380531, "flos": 494638119936.0, "grad_norm": 0.09648806821544922, "language_loss": 0.81436276, "learning_rate": 0.0008760140365464631, "loss": 0.82583249, "num_input_tokens_seen": 108840592, "router_z_loss_mlp": 0.14208984, "routerloss_mlp": 0.0, "step": 1311, "time_per_iteration": 2.599353790283203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00870128, "balance_loss_mlp": 1.50605726, "diversity_loss_mlp": 0.20002663, "epoch": 0.2524047710657945, "flos": 490544004096.0, "grad_norm": 0.03529693250820236, "language_loss": 0.871418, "learning_rate": 0.0008758086163505156, "loss": 0.88011926, "num_input_tokens_seen": 108910064, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0170862, "step": 1312, "time_per_iteration": 2.6166832447052 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01163863, "balance_loss_mlp": 1.14953399, "diversity_loss_mlp": 0.0, "epoch": 0.2525971527510581, "flos": 647431294464.0, "grad_norm": 0.07147814499844148, "language_loss": 0.89267951, "learning_rate": 0.0008756030502527239, "loss": 0.90431809, "num_input_tokens_seen": 108986336, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1313, "time_per_iteration": 2.8452062606811523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188075, "balance_loss_mlp": 1.17377019, "diversity_loss_mlp": 0.0, "epoch": 0.2527895344363217, "flos": 569266818048.0, "grad_norm": 0.09335955432973846, "language_loss": 0.90298462, "learning_rate": 0.0008753973383328954, "loss": 0.91486537, "num_input_tokens_seen": 109059712, "router_z_loss_mlp": 0.14294434, "routerloss_mlp": 0.0, "step": 1314, "time_per_iteration": 2.6988537311553955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01165459, "balance_loss_mlp": 1.15108287, "diversity_loss_mlp": 0.0, "epoch": 0.2529819161215852, "flos": 514048923648.0, "grad_norm": 0.08872096542459323, "language_loss": 0.83944553, "learning_rate": 0.0008751914806708952, "loss": 0.85110015, "num_input_tokens_seen": 109127504, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1315, "time_per_iteration": 2.6328680515289307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151493, "balance_loss_mlp": 1.1372478, "diversity_loss_mlp": 0.0, "epoch": 0.2531742978068488, "flos": 531253168128.0, "grad_norm": 0.09247066962171595, "language_loss": 0.81854099, "learning_rate": 0.0008749854773466439, "loss": 0.83005595, "num_input_tokens_seen": 109198080, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1316, "time_per_iteration": 2.6708498001098633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134446, "balance_loss_mlp": 1.11980653, "diversity_loss_mlp": 0.0, "epoch": 0.25336667949211233, "flos": 596638628352.0, "grad_norm": 0.06992463478304738, "language_loss": 0.84568423, "learning_rate": 0.0008747793284401192, "loss": 0.85702872, "num_input_tokens_seen": 109268368, "router_z_loss_mlp": 0.14611816, "routerloss_mlp": 0.0, "step": 1317, "time_per_iteration": 2.70182204246521 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120804, "balance_loss_mlp": 1.10560477, "diversity_loss_mlp": 0.0, "epoch": 0.2535590611773759, "flos": 602061963264.0, "grad_norm": 0.11229953955213261, "language_loss": 0.85994983, "learning_rate": 0.0008745730340313551, "loss": 0.87115788, "num_input_tokens_seen": 109344112, "router_z_loss_mlp": 0.1517334, "routerloss_mlp": 0.0, "step": 1318, "time_per_iteration": 2.8026556968688965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119726, "balance_loss_mlp": 1.1048007, "diversity_loss_mlp": 0.0, "epoch": 0.25375144286263945, "flos": 495327508992.0, "grad_norm": 0.0843917818222923, "language_loss": 0.84519732, "learning_rate": 0.0008743665942004422, "loss": 0.85639453, "num_input_tokens_seen": 109414112, "router_z_loss_mlp": 0.14904785, "routerloss_mlp": 0.0, "step": 1319, "time_per_iteration": 2.6717073917388916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120645, "balance_loss_mlp": 1.10569644, "diversity_loss_mlp": 0.0, "epoch": 0.25394382454790304, "flos": 512470084608.0, "grad_norm": 0.06860607652829093, "language_loss": 0.92769039, "learning_rate": 0.0008741600090275277, "loss": 0.93889689, "num_input_tokens_seen": 109484336, "router_z_loss_mlp": 0.14916992, "routerloss_mlp": 0.0, "step": 1320, "time_per_iteration": 2.6251981258392334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120587, "balance_loss_mlp": 1.10530448, "diversity_loss_mlp": 0.0, "epoch": 0.25413620623316663, "flos": 959038589952.0, "grad_norm": 0.09643257369734548, "language_loss": 0.8425917, "learning_rate": 0.0008739532785928151, "loss": 0.85379755, "num_input_tokens_seen": 109590128, "router_z_loss_mlp": 0.15258789, "routerloss_mlp": 0.0, "step": 1321, "time_per_iteration": 3.4925267696380615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101061, "balance_loss_mlp": 1.09305024, "diversity_loss_mlp": 0.0, "epoch": 0.25432858791843016, "flos": 1577283922944.0, "grad_norm": 0.04547815076873398, "language_loss": 0.74893582, "learning_rate": 0.0008737464029765639, "loss": 0.75994641, "num_input_tokens_seen": 109816592, "router_z_loss_mlp": 0.08007812, "routerloss_mlp": 0.0, "step": 1322, "time_per_iteration": 4.8446879386901855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0085354, "balance_loss_mlp": 1.4814328, "diversity_loss_mlp": 0.19370571, "epoch": 0.25452096960369375, "flos": 583802712576.0, "grad_norm": 0.036800523279172735, "language_loss": 0.82844102, "learning_rate": 0.0008735393822590908, "loss": 0.83697641, "num_input_tokens_seen": 109890464, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01597124, "step": 1323, "time_per_iteration": 2.7354650497436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01174586, "balance_loss_mlp": 1.16032863, "diversity_loss_mlp": 0.0, "epoch": 0.2547133512889573, "flos": 508603193856.0, "grad_norm": 0.08280852347492981, "language_loss": 0.87442601, "learning_rate": 0.0008733322165207681, "loss": 0.88617194, "num_input_tokens_seen": 109963408, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1324, "time_per_iteration": 2.6581695079803467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0120021, "balance_loss_mlp": 1.18529749, "diversity_loss_mlp": 0.0, "epoch": 0.25490573297422087, "flos": 782619729408.0, "grad_norm": 0.0779912319299164, "language_loss": 0.8296451, "learning_rate": 0.0008731249058420247, "loss": 0.84164721, "num_input_tokens_seen": 110048800, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1325, "time_per_iteration": 3.0674960613250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01203892, "balance_loss_mlp": 1.18865728, "diversity_loss_mlp": 0.0, "epoch": 0.2550981146594844, "flos": 509878084608.0, "grad_norm": 0.10695670124077197, "language_loss": 0.90080667, "learning_rate": 0.0008729174503033459, "loss": 0.91284555, "num_input_tokens_seen": 110118096, "router_z_loss_mlp": 0.15209961, "routerloss_mlp": 0.0, "step": 1326, "time_per_iteration": 2.6511192321777344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188403, "balance_loss_mlp": 1.17334652, "diversity_loss_mlp": 0.0, "epoch": 0.255290496344748, "flos": 676673409024.0, "grad_norm": 0.10125548093505272, "language_loss": 0.82427752, "learning_rate": 0.0008727098499852728, "loss": 0.83616149, "num_input_tokens_seen": 110190160, "router_z_loss_mlp": 0.15026855, "routerloss_mlp": 0.0, "step": 1327, "time_per_iteration": 2.833803415298462 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150318, "balance_loss_mlp": 1.13529778, "diversity_loss_mlp": 0.0, "epoch": 0.2554828780300115, "flos": 537815572992.0, "grad_norm": 0.08478455973869617, "language_loss": 0.89778203, "learning_rate": 0.0008725021049684034, "loss": 0.90928519, "num_input_tokens_seen": 110268000, "router_z_loss_mlp": 0.15002441, "routerloss_mlp": 0.0, "step": 1328, "time_per_iteration": 2.7405433654785156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116795, "balance_loss_mlp": 1.10194123, "diversity_loss_mlp": 0.0, "epoch": 0.2556752597152751, "flos": 824186409984.0, "grad_norm": 0.07099770943741918, "language_loss": 0.83078361, "learning_rate": 0.000872294215333391, "loss": 0.84195161, "num_input_tokens_seen": 110354816, "router_z_loss_mlp": 0.14831543, "routerloss_mlp": 0.0, "step": 1329, "time_per_iteration": 3.219834089279175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099158, "balance_loss_mlp": 1.08430433, "diversity_loss_mlp": 0.0, "epoch": 0.2558676414005387, "flos": 570791328768.0, "grad_norm": 0.06913408205057751, "language_loss": 0.82662833, "learning_rate": 0.0008720861811609457, "loss": 0.8376199, "num_input_tokens_seen": 110427968, "router_z_loss_mlp": 0.1484375, "routerloss_mlp": 0.0, "step": 1330, "time_per_iteration": 2.753122329711914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096587, "balance_loss_mlp": 1.0816741, "diversity_loss_mlp": 0.0, "epoch": 0.2560600230858022, "flos": 486684453888.0, "grad_norm": 0.0919113566921475, "language_loss": 0.83719599, "learning_rate": 0.0008718780025318338, "loss": 0.84816188, "num_input_tokens_seen": 110501184, "router_z_loss_mlp": 0.14880371, "routerloss_mlp": 0.0, "step": 1331, "time_per_iteration": 2.724808692932129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107534, "balance_loss_mlp": 1.09296656, "diversity_loss_mlp": 0.0, "epoch": 0.2562524047710658, "flos": 513122397696.0, "grad_norm": 0.09880415123515712, "language_loss": 0.83982158, "learning_rate": 0.0008716696795268771, "loss": 0.85089689, "num_input_tokens_seen": 110573008, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1332, "time_per_iteration": 2.718421220779419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098797, "balance_loss_mlp": 1.08430111, "diversity_loss_mlp": 0.0, "epoch": 0.25644478645632934, "flos": 634820032512.0, "grad_norm": 0.15208681676824193, "language_loss": 0.85333431, "learning_rate": 0.0008714612122269538, "loss": 0.8643223, "num_input_tokens_seen": 110646704, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1333, "time_per_iteration": 2.877823829650879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120258, "balance_loss_mlp": 1.10586989, "diversity_loss_mlp": 0.0, "epoch": 0.25663716814159293, "flos": 436591088640.0, "grad_norm": 0.07756137703605612, "language_loss": 0.89334106, "learning_rate": 0.0008712526007129982, "loss": 0.90454364, "num_input_tokens_seen": 110712208, "router_z_loss_mlp": 0.1439209, "routerloss_mlp": 0.0, "step": 1334, "time_per_iteration": 2.561842441558838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155014, "balance_loss_mlp": 1.14101923, "diversity_loss_mlp": 0.0, "epoch": 0.25682954982685646, "flos": 498161415168.0, "grad_norm": 0.12724628219842446, "language_loss": 0.90676123, "learning_rate": 0.0008710438450660003, "loss": 0.91831136, "num_input_tokens_seen": 110783936, "router_z_loss_mlp": 0.14013672, "routerloss_mlp": 0.0, "step": 1335, "time_per_iteration": 2.6618270874023438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01199277, "balance_loss_mlp": 1.18486404, "diversity_loss_mlp": 0.0, "epoch": 0.25702193151212005, "flos": 457701871104.0, "grad_norm": 0.10895723532104484, "language_loss": 0.87596953, "learning_rate": 0.0008708349453670064, "loss": 0.88796222, "num_input_tokens_seen": 110848560, "router_z_loss_mlp": 0.14404297, "routerloss_mlp": 0.0, "step": 1336, "time_per_iteration": 2.5121865272521973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01195197, "balance_loss_mlp": 1.18032002, "diversity_loss_mlp": 0.0, "epoch": 0.2572143131973836, "flos": 598281707520.0, "grad_norm": 0.10227195785495524, "language_loss": 0.91035736, "learning_rate": 0.0008706259016971185, "loss": 0.92230934, "num_input_tokens_seen": 110922672, "router_z_loss_mlp": 0.14855957, "routerloss_mlp": 0.0, "step": 1337, "time_per_iteration": 2.7760090827941895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01189061, "balance_loss_mlp": 1.17414773, "diversity_loss_mlp": 0.0, "epoch": 0.25740669488264717, "flos": 698308024320.0, "grad_norm": 0.12625436277937716, "language_loss": 0.83095431, "learning_rate": 0.0008704167141374944, "loss": 0.84284496, "num_input_tokens_seen": 110995456, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1338, "time_per_iteration": 2.824122428894043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146224, "balance_loss_mlp": 1.13107228, "diversity_loss_mlp": 0.0, "epoch": 0.25759907656791076, "flos": 502379241984.0, "grad_norm": 0.0801465901926633, "language_loss": 0.88427222, "learning_rate": 0.0008702073827693482, "loss": 0.89573455, "num_input_tokens_seen": 111069568, "router_z_loss_mlp": 0.15148926, "routerloss_mlp": 0.0, "step": 1339, "time_per_iteration": 2.708488941192627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101183, "balance_loss_mlp": 1.0865202, "diversity_loss_mlp": 0.0, "epoch": 0.2577914582531743, "flos": 773880500736.0, "grad_norm": 0.07445900988257396, "language_loss": 0.88514435, "learning_rate": 0.0008699979076739494, "loss": 0.89615613, "num_input_tokens_seen": 111142608, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1340, "time_per_iteration": 2.960650682449341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085219, "balance_loss_mlp": 1.07054412, "diversity_loss_mlp": 0.0, "epoch": 0.2579838399384379, "flos": 459666150912.0, "grad_norm": 0.09041758143252471, "language_loss": 0.88622832, "learning_rate": 0.0008697882889326234, "loss": 0.89708054, "num_input_tokens_seen": 111206336, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1341, "time_per_iteration": 2.5199689865112305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094608, "balance_loss_mlp": 1.08043432, "diversity_loss_mlp": 0.0, "epoch": 0.2581762216237014, "flos": 569185325568.0, "grad_norm": 0.08157938691300957, "language_loss": 0.86840844, "learning_rate": 0.0008695785266267515, "loss": 0.87935448, "num_input_tokens_seen": 111276736, "router_z_loss_mlp": 0.14172363, "routerloss_mlp": 0.0, "step": 1342, "time_per_iteration": 2.6833419799804688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0089859, "balance_loss_mlp": 1.56664371, "diversity_loss_mlp": 0.19803861, "epoch": 0.258368603308965, "flos": 604201711104.0, "grad_norm": 0.03344075262961686, "language_loss": 0.83491886, "learning_rate": 0.0008693686208377704, "loss": 0.84390479, "num_input_tokens_seen": 111353856, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01624843, "step": 1343, "time_per_iteration": 2.8157622814178467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101399, "balance_loss_mlp": 1.08711743, "diversity_loss_mlp": 0.0, "epoch": 0.2585609849942285, "flos": 491460618240.0, "grad_norm": 0.07460013341605923, "language_loss": 0.89022982, "learning_rate": 0.0008691585716471733, "loss": 0.90124375, "num_input_tokens_seen": 111424960, "router_z_loss_mlp": 0.1427002, "routerloss_mlp": 0.0, "step": 1344, "time_per_iteration": 2.6386232376098633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111157, "balance_loss_mlp": 1.09707415, "diversity_loss_mlp": 0.0, "epoch": 0.2587533666794921, "flos": 640755090432.0, "grad_norm": 0.08548738123283665, "language_loss": 0.85822487, "learning_rate": 0.0008689483791365079, "loss": 0.86934054, "num_input_tokens_seen": 111505248, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1345, "time_per_iteration": 2.831817626953125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112096, "balance_loss_mlp": 1.10685778, "diversity_loss_mlp": 0.0, "epoch": 0.2589457483647557, "flos": 576849724416.0, "grad_norm": 0.07218857890204664, "language_loss": 0.89327282, "learning_rate": 0.0008687380433873786, "loss": 0.90448248, "num_input_tokens_seen": 111581936, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1346, "time_per_iteration": 2.8322408199310303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139286, "balance_loss_mlp": 1.1251955, "diversity_loss_mlp": 0.0, "epoch": 0.25913813005001923, "flos": 535424007168.0, "grad_norm": 0.07612070672802876, "language_loss": 0.82638776, "learning_rate": 0.0008685275644814448, "loss": 0.83778065, "num_input_tokens_seen": 111651456, "router_z_loss_mlp": 0.14099121, "routerloss_mlp": 0.0, "step": 1347, "time_per_iteration": 2.689772367477417 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0116224, "balance_loss_mlp": 1.14764857, "diversity_loss_mlp": 0.0, "epoch": 0.2593305117352828, "flos": 721039491072.0, "grad_norm": 0.07884944678342334, "language_loss": 0.84390515, "learning_rate": 0.0008683169425004216, "loss": 0.85552752, "num_input_tokens_seen": 111731712, "router_z_loss_mlp": 0.14587402, "routerloss_mlp": 0.0, "step": 1348, "time_per_iteration": 2.895153760910034 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159732, "balance_loss_mlp": 1.14511704, "diversity_loss_mlp": 0.0, "epoch": 0.25952289342054635, "flos": 710096274432.0, "grad_norm": 0.10354145261803285, "language_loss": 0.83314335, "learning_rate": 0.0008681061775260799, "loss": 0.84474063, "num_input_tokens_seen": 111800752, "router_z_loss_mlp": 0.14599609, "routerloss_mlp": 0.0, "step": 1349, "time_per_iteration": 2.850862503051758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166024, "balance_loss_mlp": 1.15118265, "diversity_loss_mlp": 0.0, "epoch": 0.25971527510580994, "flos": 455920399872.0, "grad_norm": 0.08416928552821445, "language_loss": 0.9214983, "learning_rate": 0.0008678952696402458, "loss": 0.93315852, "num_input_tokens_seen": 111866752, "router_z_loss_mlp": 0.14819336, "routerloss_mlp": 0.0, "step": 1350, "time_per_iteration": 2.525019884109497 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153353, "balance_loss_mlp": 1.13848734, "diversity_loss_mlp": 0.0, "epoch": 0.25990765679107347, "flos": 612528334848.0, "grad_norm": 0.07397225666721696, "language_loss": 0.86554277, "learning_rate": 0.000867684218924801, "loss": 0.87707639, "num_input_tokens_seen": 111951328, "router_z_loss_mlp": 0.1484375, "routerloss_mlp": 0.0, "step": 1351, "time_per_iteration": 2.8780648708343506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083238, "balance_loss_mlp": 1.07517958, "diversity_loss_mlp": 0.0, "epoch": 0.26010003847633706, "flos": 1537963075584.0, "grad_norm": 0.0438698963901256, "language_loss": 0.78947091, "learning_rate": 0.0008674730254616827, "loss": 0.80030328, "num_input_tokens_seen": 112182272, "router_z_loss_mlp": 0.08056641, "routerloss_mlp": 0.0, "step": 1352, "time_per_iteration": 4.916059255599976 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132931, "balance_loss_mlp": 1.11807716, "diversity_loss_mlp": 0.0, "epoch": 0.2602924201616006, "flos": 716265897984.0, "grad_norm": 0.06358739416567256, "language_loss": 0.85154414, "learning_rate": 0.0008672616893328834, "loss": 0.86287344, "num_input_tokens_seen": 112261760, "router_z_loss_mlp": 0.14831543, "routerloss_mlp": 0.0, "step": 1353, "time_per_iteration": 2.9301464557647705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120208, "balance_loss_mlp": 1.10545015, "diversity_loss_mlp": 0.0, "epoch": 0.2604848018468642, "flos": 643529899008.0, "grad_norm": 0.0804298790611747, "language_loss": 0.89736795, "learning_rate": 0.0008670502106204512, "loss": 0.90857005, "num_input_tokens_seen": 112339136, "router_z_loss_mlp": 0.14733887, "routerloss_mlp": 0.0, "step": 1354, "time_per_iteration": 2.8392651081085205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121529, "balance_loss_mlp": 1.10672283, "diversity_loss_mlp": 0.0, "epoch": 0.26067718353212777, "flos": 517033704960.0, "grad_norm": 0.08121830869095954, "language_loss": 0.81676221, "learning_rate": 0.0008668385894064892, "loss": 0.82797754, "num_input_tokens_seen": 112409872, "router_z_loss_mlp": 0.14770508, "routerloss_mlp": 0.0, "step": 1355, "time_per_iteration": 2.632744550704956 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115185, "balance_loss_mlp": 1.10095191, "diversity_loss_mlp": 0.0, "epoch": 0.2608695652173913, "flos": 822733479936.0, "grad_norm": 0.0871855710564252, "language_loss": 0.88984954, "learning_rate": 0.0008666268257731562, "loss": 0.90100139, "num_input_tokens_seen": 112495616, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1356, "time_per_iteration": 3.0961363315582275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132093, "balance_loss_mlp": 1.11785948, "diversity_loss_mlp": 0.0, "epoch": 0.2610619469026549, "flos": 1007850097152.0, "grad_norm": 0.08548634624367135, "language_loss": 0.8594982, "learning_rate": 0.0008664149198026662, "loss": 0.87081909, "num_input_tokens_seen": 112575168, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1357, "time_per_iteration": 3.2423956394195557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133945, "balance_loss_mlp": 1.12039137, "diversity_loss_mlp": 0.0, "epoch": 0.2612543285879184, "flos": 536782961664.0, "grad_norm": 0.09109654485188295, "language_loss": 0.88802171, "learning_rate": 0.0008662028715772883, "loss": 0.89936113, "num_input_tokens_seen": 112648480, "router_z_loss_mlp": 0.13574219, "routerloss_mlp": 0.0, "step": 1358, "time_per_iteration": 2.619495153427124 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138578, "balance_loss_mlp": 1.12476182, "diversity_loss_mlp": 0.0, "epoch": 0.261446710273182, "flos": 519420501504.0, "grad_norm": 0.07135790209188476, "language_loss": 0.85816395, "learning_rate": 0.0008659906811793467, "loss": 0.86954975, "num_input_tokens_seen": 112719856, "router_z_loss_mlp": 0.13842773, "routerloss_mlp": 0.0, "step": 1359, "time_per_iteration": 2.6752817630767822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135637, "balance_loss_mlp": 1.12191582, "diversity_loss_mlp": 0.0, "epoch": 0.26163909195844554, "flos": 583259056128.0, "grad_norm": 0.07783428421444573, "language_loss": 0.89649427, "learning_rate": 0.0008657783486912215, "loss": 0.90785068, "num_input_tokens_seen": 112795088, "router_z_loss_mlp": 0.1373291, "routerloss_mlp": 0.0, "step": 1360, "time_per_iteration": 2.770136594772339 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00918859, "balance_loss_mlp": 1.60386825, "diversity_loss_mlp": 0.20058532, "epoch": 0.2618314736437091, "flos": 958762179072.0, "grad_norm": 0.03438194549161764, "language_loss": 0.90315008, "learning_rate": 0.0008655658741953472, "loss": 0.91233867, "num_input_tokens_seen": 112879888, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01663268, "step": 1361, "time_per_iteration": 3.239567518234253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117406, "balance_loss_mlp": 1.10352993, "diversity_loss_mlp": 0.0, "epoch": 0.26202385532897265, "flos": 574803952128.0, "grad_norm": 0.053733033776962646, "language_loss": 0.88311911, "learning_rate": 0.0008653532577742136, "loss": 0.89429319, "num_input_tokens_seen": 112952208, "router_z_loss_mlp": 0.13891602, "routerloss_mlp": 0.0, "step": 1362, "time_per_iteration": 2.6912107467651367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111717, "balance_loss_mlp": 1.09805584, "diversity_loss_mlp": 0.0, "epoch": 0.26221623701423624, "flos": 445471280640.0, "grad_norm": 0.07456283347469675, "language_loss": 0.8687824, "learning_rate": 0.0008651404995103659, "loss": 0.87989956, "num_input_tokens_seen": 113017472, "router_z_loss_mlp": 0.13671875, "routerloss_mlp": 0.0, "step": 1363, "time_per_iteration": 2.5554919242858887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106371, "balance_loss_mlp": 1.09212554, "diversity_loss_mlp": 0.0, "epoch": 0.26240861869949983, "flos": 535718043648.0, "grad_norm": 0.0735216597505126, "language_loss": 0.87311852, "learning_rate": 0.0008649275994864041, "loss": 0.88418221, "num_input_tokens_seen": 113090000, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1364, "time_per_iteration": 2.7228429317474365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109936, "balance_loss_mlp": 1.0955832, "diversity_loss_mlp": 0.0, "epoch": 0.26260100038476336, "flos": 565249052160.0, "grad_norm": 0.06423000395680191, "language_loss": 0.83767593, "learning_rate": 0.0008647145577849834, "loss": 0.84877527, "num_input_tokens_seen": 113169424, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1365, "time_per_iteration": 2.8194234371185303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110395, "balance_loss_mlp": 1.09573257, "diversity_loss_mlp": 0.0, "epoch": 0.26279338207002695, "flos": 613059508224.0, "grad_norm": 0.0636918785190987, "language_loss": 0.82912111, "learning_rate": 0.0008645013744888139, "loss": 0.8402251, "num_input_tokens_seen": 113256752, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1366, "time_per_iteration": 2.9121909141540527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106528, "balance_loss_mlp": 1.09266424, "diversity_loss_mlp": 0.0, "epoch": 0.2629857637552905, "flos": 522832568832.0, "grad_norm": 0.07268525177684865, "language_loss": 0.87255573, "learning_rate": 0.0008642880496806607, "loss": 0.88362104, "num_input_tokens_seen": 113330512, "router_z_loss_mlp": 0.13879395, "routerloss_mlp": 0.0, "step": 1367, "time_per_iteration": 2.7527663707733154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117256, "balance_loss_mlp": 1.1027844, "diversity_loss_mlp": 0.0, "epoch": 0.26317814544055407, "flos": 534549238272.0, "grad_norm": 0.06883104565378229, "language_loss": 0.84193766, "learning_rate": 0.0008640745834433437, "loss": 0.85311019, "num_input_tokens_seen": 113409088, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1368, "time_per_iteration": 2.7203800678253174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114698, "balance_loss_mlp": 1.10065532, "diversity_loss_mlp": 0.0, "epoch": 0.2633705271258176, "flos": 555543650304.0, "grad_norm": 0.0718323039568536, "language_loss": 0.87083656, "learning_rate": 0.000863860975859738, "loss": 0.88198352, "num_input_tokens_seen": 113486624, "router_z_loss_mlp": 0.14050293, "routerloss_mlp": 0.0, "step": 1369, "time_per_iteration": 2.9021553993225098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116648, "balance_loss_mlp": 1.10278392, "diversity_loss_mlp": 0.0, "epoch": 0.2635629088110812, "flos": 552401026560.0, "grad_norm": 0.08463505288724613, "language_loss": 0.88568735, "learning_rate": 0.0008636472270127733, "loss": 0.8968538, "num_input_tokens_seen": 113555776, "router_z_loss_mlp": 0.13867188, "routerloss_mlp": 0.0, "step": 1370, "time_per_iteration": 2.6336748600006104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118916, "balance_loss_mlp": 1.10440779, "diversity_loss_mlp": 0.0, "epoch": 0.2637552904963448, "flos": 455984640000.0, "grad_norm": 0.08505114845208346, "language_loss": 0.90530956, "learning_rate": 0.0008634333369854345, "loss": 0.91649872, "num_input_tokens_seen": 113624208, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1371, "time_per_iteration": 2.585775136947632 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122621, "balance_loss_mlp": 1.10868549, "diversity_loss_mlp": 0.0, "epoch": 0.2639476721816083, "flos": 613128890880.0, "grad_norm": 0.07138701063901956, "language_loss": 0.87574148, "learning_rate": 0.0008632193058607608, "loss": 0.88696772, "num_input_tokens_seen": 113698544, "router_z_loss_mlp": 0.13952637, "routerloss_mlp": 0.0, "step": 1372, "time_per_iteration": 2.719151735305786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124687, "balance_loss_mlp": 1.11042953, "diversity_loss_mlp": 0.0, "epoch": 0.2641400538668719, "flos": 571920486912.0, "grad_norm": 0.09395332240398839, "language_loss": 0.81125695, "learning_rate": 0.0008630051337218466, "loss": 0.82250381, "num_input_tokens_seen": 113769024, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1373, "time_per_iteration": 2.6700031757354736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118707, "balance_loss_mlp": 1.10506988, "diversity_loss_mlp": 0.0, "epoch": 0.2643324355521354, "flos": 582251037696.0, "grad_norm": 0.0808240378873911, "language_loss": 0.82403839, "learning_rate": 0.0008627908206518409, "loss": 0.83522546, "num_input_tokens_seen": 113836320, "router_z_loss_mlp": 0.13659668, "routerloss_mlp": 0.0, "step": 1374, "time_per_iteration": 2.6610107421875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061343, "balance_loss_mlp": 1.05442929, "diversity_loss_mlp": 0.0, "epoch": 0.264524817237399, "flos": 1544678926848.0, "grad_norm": 0.04099598647265769, "language_loss": 0.75151253, "learning_rate": 0.0008625763667339472, "loss": 0.76212597, "num_input_tokens_seen": 114065040, "router_z_loss_mlp": 0.06933594, "routerloss_mlp": 0.0, "step": 1375, "time_per_iteration": 4.979893922805786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109458, "balance_loss_mlp": 1.09580863, "diversity_loss_mlp": 0.0, "epoch": 0.26471719892266254, "flos": 518034382848.0, "grad_norm": 0.06989177478220372, "language_loss": 0.91488004, "learning_rate": 0.0008623617720514241, "loss": 0.92597461, "num_input_tokens_seen": 114133488, "router_z_loss_mlp": 0.13659668, "routerloss_mlp": 0.0, "step": 1376, "time_per_iteration": 2.6515755653381348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109452, "balance_loss_mlp": 1.09554029, "diversity_loss_mlp": 0.0, "epoch": 0.26490958060792613, "flos": 517189349376.0, "grad_norm": 0.07399727326907257, "language_loss": 0.84706682, "learning_rate": 0.0008621470366875848, "loss": 0.85816133, "num_input_tokens_seen": 114200704, "router_z_loss_mlp": 0.13916016, "routerloss_mlp": 0.0, "step": 1377, "time_per_iteration": 2.599776268005371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119233, "balance_loss_mlp": 1.10546422, "diversity_loss_mlp": 0.0, "epoch": 0.26510196229318966, "flos": 596574388224.0, "grad_norm": 0.07769258092785128, "language_loss": 0.87980253, "learning_rate": 0.0008619321607257966, "loss": 0.89099485, "num_input_tokens_seen": 114272160, "router_z_loss_mlp": 0.13781738, "routerloss_mlp": 0.0, "step": 1378, "time_per_iteration": 2.678865671157837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116065, "balance_loss_mlp": 1.10274947, "diversity_loss_mlp": 0.0, "epoch": 0.26529434397845325, "flos": 685800649728.0, "grad_norm": 0.07519514659764338, "language_loss": 0.82002568, "learning_rate": 0.000861717144249482, "loss": 0.83118635, "num_input_tokens_seen": 114347904, "router_z_loss_mlp": 0.13342285, "routerloss_mlp": 0.0, "step": 1379, "time_per_iteration": 2.8830740451812744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118616, "balance_loss_mlp": 1.10515702, "diversity_loss_mlp": 0.0, "epoch": 0.26548672566371684, "flos": 424353157632.0, "grad_norm": 0.06542821866252439, "language_loss": 0.89670694, "learning_rate": 0.0008615019873421175, "loss": 0.90789306, "num_input_tokens_seen": 114409952, "router_z_loss_mlp": 0.1348877, "routerloss_mlp": 0.0, "step": 1380, "time_per_iteration": 2.4692320823669434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124803, "balance_loss_mlp": 1.11096311, "diversity_loss_mlp": 0.0, "epoch": 0.26567910734898037, "flos": 489864526848.0, "grad_norm": 0.08230289019981965, "language_loss": 0.85984069, "learning_rate": 0.0008612866900872349, "loss": 0.87108874, "num_input_tokens_seen": 114474832, "router_z_loss_mlp": 0.1385498, "routerloss_mlp": 0.0, "step": 1381, "time_per_iteration": 2.5671193599700928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119212, "balance_loss_mlp": 1.10564578, "diversity_loss_mlp": 0.0, "epoch": 0.26587148903424396, "flos": 534203444736.0, "grad_norm": 0.09708901974799254, "language_loss": 0.8800329, "learning_rate": 0.0008610712525684197, "loss": 0.89122504, "num_input_tokens_seen": 114545152, "router_z_loss_mlp": 0.13598633, "routerloss_mlp": 0.0, "step": 1382, "time_per_iteration": 2.673672676086426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134856, "balance_loss_mlp": 1.12075388, "diversity_loss_mlp": 0.0, "epoch": 0.2660638707195075, "flos": 1017464094720.0, "grad_norm": 0.08550137436350284, "language_loss": 0.84231853, "learning_rate": 0.0008608556748693121, "loss": 0.85366714, "num_input_tokens_seen": 114626512, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1383, "time_per_iteration": 3.285391330718994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113293, "balance_loss_mlp": 1.11881518, "diversity_loss_mlp": 0.0, "epoch": 0.2662562524047711, "flos": 523981550592.0, "grad_norm": 0.07276264363306281, "language_loss": 0.86098409, "learning_rate": 0.000860639957073607, "loss": 0.87231338, "num_input_tokens_seen": 114701008, "router_z_loss_mlp": 0.14123535, "routerloss_mlp": 0.0, "step": 1384, "time_per_iteration": 2.74979829788208 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130834, "balance_loss_mlp": 1.11668396, "diversity_loss_mlp": 0.0, "epoch": 0.2664486340900346, "flos": 552381202944.0, "grad_norm": 0.07735164598050102, "language_loss": 0.87488532, "learning_rate": 0.0008604240992650534, "loss": 0.88619369, "num_input_tokens_seen": 114771984, "router_z_loss_mlp": 0.14172363, "routerloss_mlp": 0.0, "step": 1385, "time_per_iteration": 2.765714406967163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113264, "balance_loss_mlp": 1.11819148, "diversity_loss_mlp": 0.0, "epoch": 0.2666410157752982, "flos": 470157115392.0, "grad_norm": 0.09224305204204497, "language_loss": 0.89344275, "learning_rate": 0.0008602081015274545, "loss": 0.90476912, "num_input_tokens_seen": 114844800, "router_z_loss_mlp": 0.14428711, "routerloss_mlp": 0.0, "step": 1386, "time_per_iteration": 2.7466471195220947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130382, "balance_loss_mlp": 1.11580229, "diversity_loss_mlp": 0.0, "epoch": 0.2668333974605617, "flos": 569919131136.0, "grad_norm": 0.08049268911379595, "language_loss": 0.83551365, "learning_rate": 0.0008599919639446684, "loss": 0.84681749, "num_input_tokens_seen": 114918544, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1387, "time_per_iteration": 2.680053234100342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119435, "balance_loss_mlp": 1.10439074, "diversity_loss_mlp": 0.0, "epoch": 0.2670257791458253, "flos": 398982703104.0, "grad_norm": 0.08313146027802099, "language_loss": 0.80363739, "learning_rate": 0.000859775686600607, "loss": 0.81483173, "num_input_tokens_seen": 114984272, "router_z_loss_mlp": 0.15026855, "routerloss_mlp": 0.0, "step": 1388, "time_per_iteration": 2.5738272666931152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114186, "balance_loss_mlp": 1.12722135, "diversity_loss_mlp": 0.0, "epoch": 0.2672181608310889, "flos": 515847647232.0, "grad_norm": 0.08559032433145165, "language_loss": 0.85052109, "learning_rate": 0.0008595592695792367, "loss": 0.86193967, "num_input_tokens_seen": 115054800, "router_z_loss_mlp": 0.14611816, "routerloss_mlp": 0.0, "step": 1389, "time_per_iteration": 2.660012722015381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112772, "balance_loss_mlp": 1.11312914, "diversity_loss_mlp": 0.0, "epoch": 0.26741054251635243, "flos": 507521023488.0, "grad_norm": 0.07620364037172102, "language_loss": 0.90774226, "learning_rate": 0.0008593427129645778, "loss": 0.91901946, "num_input_tokens_seen": 115120928, "router_z_loss_mlp": 0.14587402, "routerloss_mlp": 0.0, "step": 1390, "time_per_iteration": 2.62744140625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131127, "balance_loss_mlp": 1.11615419, "diversity_loss_mlp": 0.0, "epoch": 0.267602924201616, "flos": 576647092224.0, "grad_norm": 0.0742307152228864, "language_loss": 0.85619152, "learning_rate": 0.0008591260168407052, "loss": 0.86750275, "num_input_tokens_seen": 115196688, "router_z_loss_mlp": 0.14953613, "routerloss_mlp": 0.0, "step": 1391, "time_per_iteration": 2.738680124282837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113811, "balance_loss_mlp": 1.09930313, "diversity_loss_mlp": 0.0, "epoch": 0.26779530588687955, "flos": 523984121856.0, "grad_norm": 0.05574398067767488, "language_loss": 0.82839364, "learning_rate": 0.0008589091812917479, "loss": 0.83953172, "num_input_tokens_seen": 115264912, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1392, "time_per_iteration": 2.5947506427764893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109245, "balance_loss_mlp": 1.09471345, "diversity_loss_mlp": 0.0, "epoch": 0.26798768757214314, "flos": 556771926528.0, "grad_norm": 0.07022348692687568, "language_loss": 0.85257161, "learning_rate": 0.0008586922064018887, "loss": 0.86366403, "num_input_tokens_seen": 115334672, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1393, "time_per_iteration": 2.6624581813812256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110195, "balance_loss_mlp": 1.09542501, "diversity_loss_mlp": 0.0, "epoch": 0.2681800692574067, "flos": 930614717952.0, "grad_norm": 0.07561979453055602, "language_loss": 0.89401793, "learning_rate": 0.0008584750922553651, "loss": 0.9051199, "num_input_tokens_seen": 115420032, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 1394, "time_per_iteration": 3.1940202713012695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107917, "balance_loss_mlp": 1.0934931, "diversity_loss_mlp": 0.0, "epoch": 0.26837245094267026, "flos": 701080261632.0, "grad_norm": 0.07234350422575066, "language_loss": 0.83740592, "learning_rate": 0.0008582578389364677, "loss": 0.84848505, "num_input_tokens_seen": 115492576, "router_z_loss_mlp": 0.14404297, "routerloss_mlp": 0.0, "step": 1395, "time_per_iteration": 2.8844621181488037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106129, "balance_loss_mlp": 1.09147811, "diversity_loss_mlp": 0.0, "epoch": 0.26856483262793385, "flos": 593191683072.0, "grad_norm": 0.061968206774760184, "language_loss": 0.91908813, "learning_rate": 0.0008580404465295422, "loss": 0.93014938, "num_input_tokens_seen": 115568368, "router_z_loss_mlp": 0.14648438, "routerloss_mlp": 0.0, "step": 1396, "time_per_iteration": 2.7842769622802734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106127, "balance_loss_mlp": 1.09155917, "diversity_loss_mlp": 0.0, "epoch": 0.2687572143131974, "flos": 714271882752.0, "grad_norm": 0.07293181793333794, "language_loss": 0.88274646, "learning_rate": 0.0008578229151189876, "loss": 0.89380777, "num_input_tokens_seen": 115651536, "router_z_loss_mlp": 0.14550781, "routerloss_mlp": 0.0, "step": 1397, "time_per_iteration": 2.96771502494812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110096, "balance_loss_mlp": 1.08638036, "diversity_loss_mlp": 0.0, "epoch": 0.26894959599846097, "flos": 467718561792.0, "grad_norm": 0.08798004746081324, "language_loss": 0.81253606, "learning_rate": 0.0008576052447892573, "loss": 0.82354569, "num_input_tokens_seen": 115715696, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1398, "time_per_iteration": 2.5413830280303955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101823, "balance_loss_mlp": 1.08761334, "diversity_loss_mlp": 0.0, "epoch": 0.2691419776837245, "flos": 468701987328.0, "grad_norm": 0.0737959226904994, "language_loss": 0.86320835, "learning_rate": 0.000857387435624858, "loss": 0.87422657, "num_input_tokens_seen": 115780928, "router_z_loss_mlp": 0.1418457, "routerloss_mlp": 0.0, "step": 1399, "time_per_iteration": 2.554016351699829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00934821, "balance_loss_mlp": 1.63627267, "diversity_loss_mlp": 0.20064378, "epoch": 0.2693343593689881, "flos": 937651396608.0, "grad_norm": 0.02492172823463741, "language_loss": 0.88190895, "learning_rate": 0.0008571694877103513, "loss": 0.89125717, "num_input_tokens_seen": 115874432, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01636335, "step": 1400, "time_per_iteration": 3.307114839553833 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110386, "balance_loss_mlp": 1.09591365, "diversity_loss_mlp": 0.0, "epoch": 0.2695267410542516, "flos": 577600782336.0, "grad_norm": 0.07757128819182789, "language_loss": 0.87680864, "learning_rate": 0.0008569514011303515, "loss": 0.88791251, "num_input_tokens_seen": 115956608, "router_z_loss_mlp": 0.14465332, "routerloss_mlp": 0.0, "step": 1401, "time_per_iteration": 2.800502300262451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00917512, "balance_loss_mlp": 1.60226941, "diversity_loss_mlp": 0.19939175, "epoch": 0.2697191227395152, "flos": 556823683584.0, "grad_norm": 0.03393521208879438, "language_loss": 0.88186574, "learning_rate": 0.0008567331759695277, "loss": 0.8910408, "num_input_tokens_seen": 116031728, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01668182, "step": 1402, "time_per_iteration": 2.7670016288757324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108043, "balance_loss_mlp": 1.09297514, "diversity_loss_mlp": 0.0, "epoch": 0.26991150442477874, "flos": 529281547776.0, "grad_norm": 0.0674494366068644, "language_loss": 0.86427194, "learning_rate": 0.0008565148123126023, "loss": 0.87535238, "num_input_tokens_seen": 116104288, "router_z_loss_mlp": 0.1505127, "routerloss_mlp": 0.0, "step": 1403, "time_per_iteration": 2.660659074783325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094781, "balance_loss_mlp": 1.08053553, "diversity_loss_mlp": 0.0, "epoch": 0.2701038861100423, "flos": 532006797312.0, "grad_norm": 0.059221605294443855, "language_loss": 0.86113608, "learning_rate": 0.0008562963102443516, "loss": 0.8720839, "num_input_tokens_seen": 116177920, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1404, "time_per_iteration": 2.6982760429382324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110424, "balance_loss_mlp": 1.090042, "diversity_loss_mlp": 0.0, "epoch": 0.2702962677953059, "flos": 735227020800.0, "grad_norm": 0.08483345099627004, "language_loss": 0.85166299, "learning_rate": 0.0008560776698496056, "loss": 0.86270541, "num_input_tokens_seen": 116251680, "router_z_loss_mlp": 0.14196777, "routerloss_mlp": 0.0, "step": 1405, "time_per_iteration": 2.9167518615722656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110133, "balance_loss_mlp": 1.09539831, "diversity_loss_mlp": 0.0, "epoch": 0.27048864948056944, "flos": 574761733632.0, "grad_norm": 0.06923600464578249, "language_loss": 0.85861331, "learning_rate": 0.0008558588912132481, "loss": 0.86971468, "num_input_tokens_seen": 116327664, "router_z_loss_mlp": 0.1472168, "routerloss_mlp": 0.0, "step": 1406, "time_per_iteration": 2.8346776962280273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00696474, "balance_loss_mlp": 1.17983532, "diversity_loss_mlp": 0.18206902, "epoch": 0.27068103116583303, "flos": 1423853489664.0, "grad_norm": 0.0036772550136199766, "language_loss": 0.76458991, "learning_rate": 0.0008556399744202163, "loss": 0.77155459, "num_input_tokens_seen": 116555152, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0155216, "step": 1407, "time_per_iteration": 4.943782091140747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105422, "balance_loss_mlp": 1.09137964, "diversity_loss_mlp": 0.0, "epoch": 0.27087341285109656, "flos": 531999456768.0, "grad_norm": 0.08329945876184135, "language_loss": 0.82942384, "learning_rate": 0.0008554209195555016, "loss": 0.84047806, "num_input_tokens_seen": 116626016, "router_z_loss_mlp": 0.14050293, "routerloss_mlp": 0.0, "step": 1408, "time_per_iteration": 2.7417516708374023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125368, "balance_loss_mlp": 1.11146832, "diversity_loss_mlp": 0.0, "epoch": 0.27106579453636015, "flos": 581378840064.0, "grad_norm": 0.06975199960684045, "language_loss": 0.8827157, "learning_rate": 0.0008552017267041483, "loss": 0.89396936, "num_input_tokens_seen": 116699152, "router_z_loss_mlp": 0.13916016, "routerloss_mlp": 0.0, "step": 1409, "time_per_iteration": 2.6978721618652344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126015, "balance_loss_mlp": 1.11216331, "diversity_loss_mlp": 0.0, "epoch": 0.2712581762216237, "flos": 506801899008.0, "grad_norm": 0.06710824628929367, "language_loss": 0.83395678, "learning_rate": 0.0008549823959512549, "loss": 0.84521693, "num_input_tokens_seen": 116770912, "router_z_loss_mlp": 0.13867188, "routerloss_mlp": 0.0, "step": 1410, "time_per_iteration": 2.6867637634277344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125714, "balance_loss_mlp": 1.11246991, "diversity_loss_mlp": 0.0, "epoch": 0.27145055790688727, "flos": 997442823168.0, "grad_norm": 0.07002470067050659, "language_loss": 0.86486357, "learning_rate": 0.0008547629273819728, "loss": 0.87612069, "num_input_tokens_seen": 116863088, "router_z_loss_mlp": 0.13262939, "routerloss_mlp": 0.0, "step": 1411, "time_per_iteration": 3.410454750061035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142962, "balance_loss_mlp": 1.12940812, "diversity_loss_mlp": 0.0, "epoch": 0.2716429395921508, "flos": 546681083904.0, "grad_norm": 0.07619635814943253, "language_loss": 0.83522588, "learning_rate": 0.0008545433210815074, "loss": 0.84665549, "num_input_tokens_seen": 116929504, "router_z_loss_mlp": 0.13586426, "routerloss_mlp": 0.0, "step": 1412, "time_per_iteration": 2.638172149658203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139051, "balance_loss_mlp": 1.12536621, "diversity_loss_mlp": 0.0, "epoch": 0.2718353212774144, "flos": 573225113088.0, "grad_norm": 0.06317158203016926, "language_loss": 0.87351668, "learning_rate": 0.0008543235771351176, "loss": 0.88490719, "num_input_tokens_seen": 117004064, "router_z_loss_mlp": 0.13696289, "routerloss_mlp": 0.0, "step": 1413, "time_per_iteration": 2.7705581188201904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01159735, "balance_loss_mlp": 1.14645457, "diversity_loss_mlp": 0.0, "epoch": 0.272027702962678, "flos": 644305549824.0, "grad_norm": 0.08259318688939964, "language_loss": 0.84684592, "learning_rate": 0.0008541036956281154, "loss": 0.85844326, "num_input_tokens_seen": 117081328, "router_z_loss_mlp": 0.13305664, "routerloss_mlp": 0.0, "step": 1414, "time_per_iteration": 2.8803579807281494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147853, "balance_loss_mlp": 1.13435841, "diversity_loss_mlp": 0.0, "epoch": 0.2722200846479415, "flos": 653726827008.0, "grad_norm": 0.09396951476817994, "language_loss": 0.81928164, "learning_rate": 0.0008538836766458665, "loss": 0.83076018, "num_input_tokens_seen": 117156544, "router_z_loss_mlp": 0.13519287, "routerloss_mlp": 0.0, "step": 1415, "time_per_iteration": 2.860991954803467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140979, "balance_loss_mlp": 1.12721062, "diversity_loss_mlp": 0.0, "epoch": 0.2724124663332051, "flos": 579631873536.0, "grad_norm": 0.07553622395064079, "language_loss": 0.84927893, "learning_rate": 0.0008536635202737897, "loss": 0.86068869, "num_input_tokens_seen": 117230208, "router_z_loss_mlp": 0.13781738, "routerloss_mlp": 0.0, "step": 1416, "time_per_iteration": 2.848196268081665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146453, "balance_loss_mlp": 1.13278019, "diversity_loss_mlp": 0.0, "epoch": 0.2726048480184686, "flos": 537435274752.0, "grad_norm": 0.07031625369418516, "language_loss": 0.82188255, "learning_rate": 0.0008534432265973573, "loss": 0.83334708, "num_input_tokens_seen": 117298080, "router_z_loss_mlp": 0.13696289, "routerloss_mlp": 0.0, "step": 1417, "time_per_iteration": 2.6029789447784424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01153419, "balance_loss_mlp": 1.13950717, "diversity_loss_mlp": 0.0, "epoch": 0.2727972297037322, "flos": 995797172736.0, "grad_norm": 0.07823597875801033, "language_loss": 0.88322413, "learning_rate": 0.000853222795702095, "loss": 0.89475828, "num_input_tokens_seen": 117396256, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1418, "time_per_iteration": 3.3933968544006348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149691, "balance_loss_mlp": 1.13570726, "diversity_loss_mlp": 0.0, "epoch": 0.27298961138899575, "flos": 606205638144.0, "grad_norm": 0.07267637680100167, "language_loss": 0.83730674, "learning_rate": 0.0008530022276735813, "loss": 0.84880364, "num_input_tokens_seen": 117467936, "router_z_loss_mlp": 0.13977051, "routerloss_mlp": 0.0, "step": 1419, "time_per_iteration": 2.766181707382202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134499, "balance_loss_mlp": 1.12086129, "diversity_loss_mlp": 0.0, "epoch": 0.27318199307425933, "flos": 529325964288.0, "grad_norm": 0.06887995103877555, "language_loss": 0.86238861, "learning_rate": 0.0008527815225974489, "loss": 0.87373358, "num_input_tokens_seen": 117538256, "router_z_loss_mlp": 0.13671875, "routerloss_mlp": 0.0, "step": 1420, "time_per_iteration": 2.6471102237701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135972, "balance_loss_mlp": 1.12148833, "diversity_loss_mlp": 0.0, "epoch": 0.2733743747595229, "flos": 409029129216.0, "grad_norm": 0.10131461494963417, "language_loss": 0.88726115, "learning_rate": 0.0008525606805593829, "loss": 0.89862096, "num_input_tokens_seen": 117599488, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1421, "time_per_iteration": 2.436647653579712 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118286, "balance_loss_mlp": 1.10405266, "diversity_loss_mlp": 0.0, "epoch": 0.27356675644478645, "flos": 516225747456.0, "grad_norm": 0.0859881194807961, "language_loss": 0.8254106, "learning_rate": 0.0008523397016451213, "loss": 0.83659345, "num_input_tokens_seen": 117664240, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1422, "time_per_iteration": 2.593588352203369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103656, "balance_loss_mlp": 1.08907628, "diversity_loss_mlp": 0.0, "epoch": 0.27375913813005004, "flos": 1052342088192.0, "grad_norm": 0.06052148467578676, "language_loss": 0.87038374, "learning_rate": 0.0008521185859404564, "loss": 0.88142037, "num_input_tokens_seen": 117754768, "router_z_loss_mlp": 0.14550781, "routerloss_mlp": 0.0, "step": 1423, "time_per_iteration": 3.3936307430267334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092129, "balance_loss_mlp": 1.07775199, "diversity_loss_mlp": 0.0, "epoch": 0.27395151981531357, "flos": 624805913088.0, "grad_norm": 0.06977326166261295, "language_loss": 0.8940134, "learning_rate": 0.0008518973335312326, "loss": 0.90493476, "num_input_tokens_seen": 117832816, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1424, "time_per_iteration": 2.7834270000457764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081272, "balance_loss_mlp": 1.06702638, "diversity_loss_mlp": 0.0, "epoch": 0.27414390150057716, "flos": 550372506624.0, "grad_norm": 0.119675165593639, "language_loss": 0.83282709, "learning_rate": 0.0008516759445033477, "loss": 0.84363985, "num_input_tokens_seen": 117899168, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1425, "time_per_iteration": 2.665099859237671 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083767, "balance_loss_mlp": 1.06930685, "diversity_loss_mlp": 0.0, "epoch": 0.2743362831858407, "flos": 539866487808.0, "grad_norm": 0.08266887436661914, "language_loss": 0.85026807, "learning_rate": 0.0008514544189427526, "loss": 0.86110568, "num_input_tokens_seen": 117972384, "router_z_loss_mlp": 0.14477539, "routerloss_mlp": 0.0, "step": 1426, "time_per_iteration": 2.6887404918670654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086038, "balance_loss_mlp": 1.07249546, "diversity_loss_mlp": 0.0, "epoch": 0.2745286648711043, "flos": 468590759424.0, "grad_norm": 0.06908859165293682, "language_loss": 0.86575979, "learning_rate": 0.0008512327569354511, "loss": 0.87662017, "num_input_tokens_seen": 118039584, "router_z_loss_mlp": 0.13562012, "routerloss_mlp": 0.0, "step": 1427, "time_per_iteration": 2.5235631465911865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108142, "balance_loss_mlp": 1.09480238, "diversity_loss_mlp": 0.0, "epoch": 0.2747210465563678, "flos": 472867683840.0, "grad_norm": 0.08987008099145026, "language_loss": 0.8368206, "learning_rate": 0.0008510109585675001, "loss": 0.847902, "num_input_tokens_seen": 118108352, "router_z_loss_mlp": 0.13360596, "routerloss_mlp": 0.0, "step": 1428, "time_per_iteration": 2.613348960876465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140613, "balance_loss_mlp": 1.13260245, "diversity_loss_mlp": 0.0, "epoch": 0.2749134282416314, "flos": 1315085372928.0, "grad_norm": 0.05207498704371428, "language_loss": 0.81153345, "learning_rate": 0.0008507890239250093, "loss": 0.82293957, "num_input_tokens_seen": 118331120, "router_z_loss_mlp": 0.08007812, "routerloss_mlp": 0.0, "step": 1429, "time_per_iteration": 4.706013202667236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133032, "balance_loss_mlp": 1.11977601, "diversity_loss_mlp": 0.0, "epoch": 0.275105809926895, "flos": 970861718016.0, "grad_norm": 0.09002666847623074, "language_loss": 0.80503839, "learning_rate": 0.0008505669530941415, "loss": 0.8163687, "num_input_tokens_seen": 118415872, "router_z_loss_mlp": 0.13269043, "routerloss_mlp": 0.0, "step": 1430, "time_per_iteration": 3.2976372241973877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0097004, "balance_loss_mlp": 1.70641518, "diversity_loss_mlp": 0.20088202, "epoch": 0.2752981916121585, "flos": 527344432128.0, "grad_norm": 0.03747760406507578, "language_loss": 0.84294951, "learning_rate": 0.000850344746161112, "loss": 0.85264993, "num_input_tokens_seen": 118483008, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01639144, "step": 1431, "time_per_iteration": 2.6297106742858887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139922, "balance_loss_mlp": 1.12685704, "diversity_loss_mlp": 0.0, "epoch": 0.2754905732974221, "flos": 453709071360.0, "grad_norm": 0.08230554095697513, "language_loss": 0.87346137, "learning_rate": 0.0008501224032121894, "loss": 0.88486063, "num_input_tokens_seen": 118545840, "router_z_loss_mlp": 0.13079834, "routerloss_mlp": 0.0, "step": 1432, "time_per_iteration": 2.4853787422180176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129049, "balance_loss_mlp": 1.1158998, "diversity_loss_mlp": 0.0, "epoch": 0.27568295498268564, "flos": 497474597376.0, "grad_norm": 0.06557126517551867, "language_loss": 0.82118285, "learning_rate": 0.0008498999243336946, "loss": 0.83247334, "num_input_tokens_seen": 118615168, "router_z_loss_mlp": 0.13171387, "routerloss_mlp": 0.0, "step": 1433, "time_per_iteration": 2.623809576034546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130575, "balance_loss_mlp": 1.11776567, "diversity_loss_mlp": 0.0, "epoch": 0.2758753366679492, "flos": 608194510848.0, "grad_norm": 0.0832335684907068, "language_loss": 0.87471139, "learning_rate": 0.0008496773096120021, "loss": 0.88601708, "num_input_tokens_seen": 118690384, "router_z_loss_mlp": 0.12817383, "routerloss_mlp": 0.0, "step": 1434, "time_per_iteration": 2.7995760440826416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111971, "balance_loss_mlp": 1.10637057, "diversity_loss_mlp": 0.0, "epoch": 0.27606771835321275, "flos": 740129094144.0, "grad_norm": 0.10286197296711953, "language_loss": 0.84387434, "learning_rate": 0.0008494545591335381, "loss": 0.85507143, "num_input_tokens_seen": 118763024, "router_z_loss_mlp": 0.13354492, "routerloss_mlp": 0.0, "step": 1435, "time_per_iteration": 2.933576822280884 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113068, "balance_loss_mlp": 1.09978795, "diversity_loss_mlp": 0.0, "epoch": 0.27626010003847634, "flos": 554572707840.0, "grad_norm": 0.053150449500146836, "language_loss": 0.86971611, "learning_rate": 0.0008492316729847823, "loss": 0.88084674, "num_input_tokens_seen": 118845536, "router_z_loss_mlp": 0.13293457, "routerloss_mlp": 0.0, "step": 1436, "time_per_iteration": 2.8865604400634766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110011, "balance_loss_mlp": 1.09676659, "diversity_loss_mlp": 0.0, "epoch": 0.2764524817237399, "flos": 542554661376.0, "grad_norm": 0.08937825724590943, "language_loss": 0.7968539, "learning_rate": 0.0008490086512522664, "loss": 0.80795395, "num_input_tokens_seen": 118919008, "router_z_loss_mlp": 0.13269043, "routerloss_mlp": 0.0, "step": 1437, "time_per_iteration": 2.7166872024536133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105369, "balance_loss_mlp": 1.0916723, "diversity_loss_mlp": 0.0, "epoch": 0.27664486340900346, "flos": 406246980096.0, "grad_norm": 0.09013751301914075, "language_loss": 0.90582836, "learning_rate": 0.0008487854940225755, "loss": 0.91688204, "num_input_tokens_seen": 118981376, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1438, "time_per_iteration": 2.4426465034484863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102616, "balance_loss_mlp": 1.08844161, "diversity_loss_mlp": 0.0, "epoch": 0.27683724509426705, "flos": 522138410496.0, "grad_norm": 0.09066429268698341, "language_loss": 0.89896768, "learning_rate": 0.0008485622013823466, "loss": 0.90999383, "num_input_tokens_seen": 119050560, "router_z_loss_mlp": 0.14172363, "routerloss_mlp": 0.0, "step": 1439, "time_per_iteration": 2.599177360534668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090727, "balance_loss_mlp": 1.07675576, "diversity_loss_mlp": 0.0, "epoch": 0.2770296267795306, "flos": 535349855232.0, "grad_norm": 0.08059762035463526, "language_loss": 0.83446515, "learning_rate": 0.00084833877341827, "loss": 0.84537244, "num_input_tokens_seen": 119121104, "router_z_loss_mlp": 0.13977051, "routerloss_mlp": 0.0, "step": 1440, "time_per_iteration": 2.667215347290039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090283, "balance_loss_mlp": 1.0762167, "diversity_loss_mlp": 0.0, "epoch": 0.27722200846479417, "flos": 487991651328.0, "grad_norm": 0.07889497077341047, "language_loss": 0.80625433, "learning_rate": 0.000848115210217088, "loss": 0.81715715, "num_input_tokens_seen": 119187712, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1441, "time_per_iteration": 2.5463788509368896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094415, "balance_loss_mlp": 1.08003855, "diversity_loss_mlp": 0.0, "epoch": 0.2774143901500577, "flos": 618297836544.0, "grad_norm": 0.08443965058939805, "language_loss": 0.81771946, "learning_rate": 0.0008478915118655952, "loss": 0.82866359, "num_input_tokens_seen": 119259264, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1442, "time_per_iteration": 2.743678569793701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118232, "balance_loss_mlp": 1.10385561, "diversity_loss_mlp": 0.0, "epoch": 0.2776067718353213, "flos": 513819127296.0, "grad_norm": 0.07019455815968899, "language_loss": 0.86195552, "learning_rate": 0.0008476676784506393, "loss": 0.87313789, "num_input_tokens_seen": 119328304, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1443, "time_per_iteration": 2.663422107696533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124691, "balance_loss_mlp": 1.10996866, "diversity_loss_mlp": 0.0, "epoch": 0.2777991535205848, "flos": 1004395811328.0, "grad_norm": 0.08623331537045495, "language_loss": 0.81889486, "learning_rate": 0.0008474437100591201, "loss": 0.83014178, "num_input_tokens_seen": 119412352, "router_z_loss_mlp": 0.14709473, "routerloss_mlp": 0.0, "step": 1444, "time_per_iteration": 3.340557813644409 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129459, "balance_loss_mlp": 1.11489129, "diversity_loss_mlp": 0.0, "epoch": 0.2779915352058484, "flos": 550278531072.0, "grad_norm": 0.08279806566523454, "language_loss": 0.85577607, "learning_rate": 0.0008472196067779898, "loss": 0.86707067, "num_input_tokens_seen": 119484464, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1445, "time_per_iteration": 2.675623655319214 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112665, "balance_loss_mlp": 1.09800267, "diversity_loss_mlp": 0.0, "epoch": 0.278183916891112, "flos": 873798160896.0, "grad_norm": 0.10281028137483857, "language_loss": 0.85108185, "learning_rate": 0.0008469953686942531, "loss": 0.86220849, "num_input_tokens_seen": 119557280, "router_z_loss_mlp": 0.14672852, "routerloss_mlp": 0.0, "step": 1446, "time_per_iteration": 3.0647382736206055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00933894, "balance_loss_mlp": 1.63962197, "diversity_loss_mlp": 0.19544066, "epoch": 0.2783762985763755, "flos": 624064766976.0, "grad_norm": 0.039122045531048345, "language_loss": 0.83261281, "learning_rate": 0.0008467709958949668, "loss": 0.84195173, "num_input_tokens_seen": 119631232, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01636306, "step": 1447, "time_per_iteration": 2.777806043624878 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00932176, "balance_loss_mlp": 1.63710666, "diversity_loss_mlp": 0.19454433, "epoch": 0.2785686802616391, "flos": 581838432768.0, "grad_norm": 0.036668832644649825, "language_loss": 0.85678959, "learning_rate": 0.0008465464884672403, "loss": 0.8661114, "num_input_tokens_seen": 119700224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01635053, "step": 1448, "time_per_iteration": 2.7313778400421143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109364, "balance_loss_mlp": 1.07944214, "diversity_loss_mlp": 0.0, "epoch": 0.27876106194690264, "flos": 587333348352.0, "grad_norm": 0.08672786191572247, "language_loss": 0.85892808, "learning_rate": 0.0008463218464982348, "loss": 0.86986446, "num_input_tokens_seen": 119781376, "router_z_loss_mlp": 0.14221191, "routerloss_mlp": 0.0, "step": 1449, "time_per_iteration": 2.8115885257720947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109775, "balance_loss_mlp": 1.08367157, "diversity_loss_mlp": 0.0, "epoch": 0.27895344363216623, "flos": 875982325248.0, "grad_norm": 0.09681901325388456, "language_loss": 0.8756566, "learning_rate": 0.0008460970700751645, "loss": 0.88663405, "num_input_tokens_seen": 119856672, "router_z_loss_mlp": 0.14086914, "routerloss_mlp": 0.0, "step": 1450, "time_per_iteration": 3.071645975112915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093318, "balance_loss_mlp": 1.07963276, "diversity_loss_mlp": 0.0, "epoch": 0.27914582531742976, "flos": 603910245888.0, "grad_norm": 0.09020366192691211, "language_loss": 0.87640095, "learning_rate": 0.000845872159285295, "loss": 0.88733411, "num_input_tokens_seen": 119929008, "router_z_loss_mlp": 0.13708496, "routerloss_mlp": 0.0, "step": 1451, "time_per_iteration": 2.7342164516448975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051691, "balance_loss_mlp": 1.04301238, "diversity_loss_mlp": 0.0, "epoch": 0.27933820700269335, "flos": 1497738097152.0, "grad_norm": 0.032344288076380935, "language_loss": 0.77766848, "learning_rate": 0.0008456471142159447, "loss": 0.78818536, "num_input_tokens_seen": 120164032, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 1452, "time_per_iteration": 4.95387077331543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121492, "balance_loss_mlp": 1.10795009, "diversity_loss_mlp": 0.0, "epoch": 0.2795305886879569, "flos": 1031859025920.0, "grad_norm": 0.08097200979220782, "language_loss": 0.86171871, "learning_rate": 0.0008454219349544836, "loss": 0.87293363, "num_input_tokens_seen": 120246784, "router_z_loss_mlp": 0.13562012, "routerloss_mlp": 0.0, "step": 1453, "time_per_iteration": 3.373755693435669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127619, "balance_loss_mlp": 1.11439896, "diversity_loss_mlp": 0.0, "epoch": 0.27972297037322047, "flos": 607058012160.0, "grad_norm": 0.0882994281711823, "language_loss": 0.81864405, "learning_rate": 0.000845196621588334, "loss": 0.82992017, "num_input_tokens_seen": 120318208, "router_z_loss_mlp": 0.13244629, "routerloss_mlp": 0.0, "step": 1454, "time_per_iteration": 2.758122682571411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147815, "balance_loss_mlp": 1.13453507, "diversity_loss_mlp": 0.0, "epoch": 0.27991535205848406, "flos": 630380123136.0, "grad_norm": 0.06575509380885615, "language_loss": 0.76256007, "learning_rate": 0.0008449711742049706, "loss": 0.7740382, "num_input_tokens_seen": 120393248, "router_z_loss_mlp": 0.13305664, "routerloss_mlp": 0.0, "step": 1455, "time_per_iteration": 2.752345561981201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156513, "balance_loss_mlp": 1.1432693, "diversity_loss_mlp": 0.0, "epoch": 0.2801077337437476, "flos": 549297676800.0, "grad_norm": 0.10411587441286801, "language_loss": 0.84306383, "learning_rate": 0.0008447455928919196, "loss": 0.85462898, "num_input_tokens_seen": 120461040, "router_z_loss_mlp": 0.13256836, "routerloss_mlp": 0.0, "step": 1456, "time_per_iteration": 2.6104180812835693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146529, "balance_loss_mlp": 1.13327312, "diversity_loss_mlp": 0.0, "epoch": 0.2803001154290112, "flos": 486761177088.0, "grad_norm": 0.07273170046833245, "language_loss": 0.86767292, "learning_rate": 0.0008445198777367595, "loss": 0.87913817, "num_input_tokens_seen": 120530400, "router_z_loss_mlp": 0.1328125, "routerloss_mlp": 0.0, "step": 1457, "time_per_iteration": 2.614743947982788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144047, "balance_loss_mlp": 1.13080251, "diversity_loss_mlp": 0.0, "epoch": 0.2804924971142747, "flos": 522074170368.0, "grad_norm": 0.08362811388708001, "language_loss": 0.81054902, "learning_rate": 0.0008442940288271208, "loss": 0.82198954, "num_input_tokens_seen": 120598304, "router_z_loss_mlp": 0.13256836, "routerloss_mlp": 0.0, "step": 1458, "time_per_iteration": 2.615705966949463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112578, "balance_loss_mlp": 1.11191583, "diversity_loss_mlp": 0.0, "epoch": 0.2806848787995383, "flos": 527697566208.0, "grad_norm": 0.06892977395484212, "language_loss": 0.8688817, "learning_rate": 0.0008440680462506856, "loss": 0.88013953, "num_input_tokens_seen": 120675712, "router_z_loss_mlp": 0.13867188, "routerloss_mlp": 0.0, "step": 1459, "time_per_iteration": 2.810474157333374 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121233, "balance_loss_mlp": 1.10828125, "diversity_loss_mlp": 0.0, "epoch": 0.2808772604848018, "flos": 485493626880.0, "grad_norm": 0.06441288224223744, "language_loss": 0.86424565, "learning_rate": 0.0008438419300951883, "loss": 0.87545788, "num_input_tokens_seen": 120746544, "router_z_loss_mlp": 0.12957764, "routerloss_mlp": 0.0, "step": 1460, "time_per_iteration": 2.6540863513946533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115517, "balance_loss_mlp": 1.10215354, "diversity_loss_mlp": 0.0, "epoch": 0.2810696421700654, "flos": 618139620864.0, "grad_norm": 0.12446768600100189, "language_loss": 0.86647975, "learning_rate": 0.0008436156804484148, "loss": 0.87763494, "num_input_tokens_seen": 120823520, "router_z_loss_mlp": 0.13378906, "routerloss_mlp": 0.0, "step": 1461, "time_per_iteration": 2.810589075088501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110833, "balance_loss_mlp": 1.0965395, "diversity_loss_mlp": 0.0, "epoch": 0.28126202385532895, "flos": 454754165760.0, "grad_norm": 0.08490544085138897, "language_loss": 0.88168794, "learning_rate": 0.0008433892973982031, "loss": 0.89279622, "num_input_tokens_seen": 120889568, "router_z_loss_mlp": 0.14294434, "routerloss_mlp": 0.0, "step": 1462, "time_per_iteration": 2.561211347579956 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115366, "balance_loss_mlp": 1.10098886, "diversity_loss_mlp": 0.0, "epoch": 0.28145440554059253, "flos": 530704742400.0, "grad_norm": 0.07295818188475026, "language_loss": 0.84776855, "learning_rate": 0.0008431627810324431, "loss": 0.85892212, "num_input_tokens_seen": 120958480, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1463, "time_per_iteration": 2.654146671295166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117739, "balance_loss_mlp": 1.10345769, "diversity_loss_mlp": 0.0, "epoch": 0.2816467872258561, "flos": 452228977152.0, "grad_norm": 0.06893619297503142, "language_loss": 0.8126353, "learning_rate": 0.000842936131439076, "loss": 0.82381272, "num_input_tokens_seen": 121028032, "router_z_loss_mlp": 0.1427002, "routerloss_mlp": 0.0, "step": 1464, "time_per_iteration": 2.6571760177612305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115394, "balance_loss_mlp": 1.1010766, "diversity_loss_mlp": 0.0, "epoch": 0.28183916891111965, "flos": 472712039424.0, "grad_norm": 0.07879840484237804, "language_loss": 0.87885797, "learning_rate": 0.0008427093487060951, "loss": 0.89001191, "num_input_tokens_seen": 121099280, "router_z_loss_mlp": 0.14294434, "routerloss_mlp": 0.0, "step": 1465, "time_per_iteration": 2.6847336292266846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101907, "balance_loss_mlp": 1.08776927, "diversity_loss_mlp": 0.0, "epoch": 0.28203155059638324, "flos": 557053479936.0, "grad_norm": 0.06118480673876746, "language_loss": 0.84661305, "learning_rate": 0.000842482432921545, "loss": 0.8576321, "num_input_tokens_seen": 121180240, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1466, "time_per_iteration": 2.884965181350708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110751, "balance_loss_mlp": 1.09353852, "diversity_loss_mlp": 0.0, "epoch": 0.28222393228164677, "flos": 416980224000.0, "grad_norm": 0.07927655906335743, "language_loss": 0.87199128, "learning_rate": 0.0008422553841735225, "loss": 0.88306642, "num_input_tokens_seen": 121242736, "router_z_loss_mlp": 0.13977051, "routerloss_mlp": 0.0, "step": 1467, "time_per_iteration": 2.528017997741699 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115631, "balance_loss_mlp": 1.10146928, "diversity_loss_mlp": 0.0, "epoch": 0.28241631396691036, "flos": 604910923776.0, "grad_norm": 0.07348722340160863, "language_loss": 0.84837711, "learning_rate": 0.0008420282025501757, "loss": 0.85953343, "num_input_tokens_seen": 121319248, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1468, "time_per_iteration": 2.7696359157562256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115321, "balance_loss_mlp": 1.10156429, "diversity_loss_mlp": 0.0, "epoch": 0.2826086956521739, "flos": 572968152576.0, "grad_norm": 0.07024793700711117, "language_loss": 0.85080296, "learning_rate": 0.0008418008881397043, "loss": 0.86195612, "num_input_tokens_seen": 121392064, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1469, "time_per_iteration": 2.659646511077881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115825, "balance_loss_mlp": 1.10241413, "diversity_loss_mlp": 0.0, "epoch": 0.2828010773374375, "flos": 842756949504.0, "grad_norm": 0.12791916727658353, "language_loss": 0.82420468, "learning_rate": 0.0008415734410303595, "loss": 0.83536291, "num_input_tokens_seen": 121475984, "router_z_loss_mlp": 0.13439941, "routerloss_mlp": 0.0, "step": 1470, "time_per_iteration": 3.2350287437438965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120259, "balance_loss_mlp": 1.10672879, "diversity_loss_mlp": 0.0, "epoch": 0.28299345902270107, "flos": 542675801088.0, "grad_norm": 0.0700140113394834, "language_loss": 0.90437436, "learning_rate": 0.0008413458613104444, "loss": 0.91557699, "num_input_tokens_seen": 121551024, "router_z_loss_mlp": 0.13549805, "routerloss_mlp": 0.0, "step": 1471, "time_per_iteration": 2.7219245433807373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111254, "balance_loss_mlp": 1.09766376, "diversity_loss_mlp": 0.0, "epoch": 0.2831858407079646, "flos": 571606626816.0, "grad_norm": 0.07145574186167022, "language_loss": 0.83164495, "learning_rate": 0.0008411181490683129, "loss": 0.84275752, "num_input_tokens_seen": 121624528, "router_z_loss_mlp": 0.1361084, "routerloss_mlp": 0.0, "step": 1472, "time_per_iteration": 2.727936029434204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107735, "balance_loss_mlp": 1.09348917, "diversity_loss_mlp": 0.0, "epoch": 0.2833782223932282, "flos": 763826734080.0, "grad_norm": 0.0645149730480124, "language_loss": 0.82377428, "learning_rate": 0.0008408903043923707, "loss": 0.83485162, "num_input_tokens_seen": 121706736, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1473, "time_per_iteration": 2.9972269535064697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111455, "balance_loss_mlp": 1.1004951, "diversity_loss_mlp": 0.0, "epoch": 0.2835706040784917, "flos": 539051189760.0, "grad_norm": 0.09233547648167305, "language_loss": 0.81268132, "learning_rate": 0.0008406623273710754, "loss": 0.82382679, "num_input_tokens_seen": 121773008, "router_z_loss_mlp": 0.140625, "routerloss_mlp": 0.0, "step": 1474, "time_per_iteration": 2.5923123359680176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105938, "balance_loss_mlp": 1.09263408, "diversity_loss_mlp": 0.0, "epoch": 0.2837629857637553, "flos": 530593514496.0, "grad_norm": 0.0761903935255829, "language_loss": 0.8290056, "learning_rate": 0.0008404342180929351, "loss": 0.840065, "num_input_tokens_seen": 121840016, "router_z_loss_mlp": 0.13330078, "routerloss_mlp": 0.0, "step": 1475, "time_per_iteration": 2.664698600769043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121728, "balance_loss_mlp": 1.10819817, "diversity_loss_mlp": 0.0, "epoch": 0.28395536744901884, "flos": 540032044032.0, "grad_norm": 0.08946081876366527, "language_loss": 0.81824017, "learning_rate": 0.00084020597664651, "loss": 0.82945752, "num_input_tokens_seen": 121915008, "router_z_loss_mlp": 0.13549805, "routerloss_mlp": 0.0, "step": 1476, "time_per_iteration": 2.7941510677337646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113829, "balance_loss_mlp": 1.10019112, "diversity_loss_mlp": 0.0, "epoch": 0.2841477491342824, "flos": 573635146752.0, "grad_norm": 0.09030679544521746, "language_loss": 0.83820337, "learning_rate": 0.0008399776031204111, "loss": 0.84934169, "num_input_tokens_seen": 121987456, "router_z_loss_mlp": 0.13659668, "routerloss_mlp": 0.0, "step": 1477, "time_per_iteration": 2.7508158683776855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101529, "balance_loss_mlp": 1.08784389, "diversity_loss_mlp": 0.0, "epoch": 0.28434013081954596, "flos": 572068790784.0, "grad_norm": 0.07642048536310797, "language_loss": 0.79864645, "learning_rate": 0.0008397490976033009, "loss": 0.80966175, "num_input_tokens_seen": 122058720, "router_z_loss_mlp": 0.13696289, "routerloss_mlp": 0.0, "step": 1478, "time_per_iteration": 2.6500625610351562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054127, "balance_loss_mlp": 1.04673624, "diversity_loss_mlp": 0.0, "epoch": 0.28453251250480954, "flos": 1553376310272.0, "grad_norm": 0.0303646120618472, "language_loss": 0.77879643, "learning_rate": 0.000839520460183893, "loss": 0.78933775, "num_input_tokens_seen": 122285792, "router_z_loss_mlp": 0.07373047, "routerloss_mlp": 0.0, "step": 1479, "time_per_iteration": 4.757360935211182 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098606, "balance_loss_mlp": 1.08449173, "diversity_loss_mlp": 0.0, "epoch": 0.28472489419007313, "flos": 749061043200.0, "grad_norm": 0.06570619267025138, "language_loss": 0.85133117, "learning_rate": 0.0008392916909509525, "loss": 0.86231726, "num_input_tokens_seen": 122366608, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1480, "time_per_iteration": 3.0309877395629883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093081, "balance_loss_mlp": 1.07888281, "diversity_loss_mlp": 0.0, "epoch": 0.28491727587533666, "flos": 490158563328.0, "grad_norm": 0.07896332999012158, "language_loss": 0.8543641, "learning_rate": 0.0008390627899932954, "loss": 0.86529493, "num_input_tokens_seen": 122435536, "router_z_loss_mlp": 0.14208984, "routerloss_mlp": 0.0, "step": 1481, "time_per_iteration": 2.5937705039978027 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100254, "balance_loss_mlp": 1.08532953, "diversity_loss_mlp": 0.0, "epoch": 0.28510965756060025, "flos": 729007838208.0, "grad_norm": 0.08879627929694006, "language_loss": 0.88894033, "learning_rate": 0.000838833757399789, "loss": 0.89994287, "num_input_tokens_seen": 122515584, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1482, "time_per_iteration": 2.95451283454895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106961, "balance_loss_mlp": 1.09247661, "diversity_loss_mlp": 0.0, "epoch": 0.2853020392458638, "flos": 551573245440.0, "grad_norm": 0.08557616325511565, "language_loss": 0.80760586, "learning_rate": 0.0008386045932593515, "loss": 0.81867552, "num_input_tokens_seen": 122585552, "router_z_loss_mlp": 0.14465332, "routerloss_mlp": 0.0, "step": 1483, "time_per_iteration": 2.6901025772094727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112082, "balance_loss_mlp": 1.09776473, "diversity_loss_mlp": 0.0, "epoch": 0.28549442093112737, "flos": 754783557120.0, "grad_norm": 0.0661413109298982, "language_loss": 0.86017227, "learning_rate": 0.0008383752976609525, "loss": 0.87129307, "num_input_tokens_seen": 122658928, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1484, "time_per_iteration": 2.9148330688476562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116421, "balance_loss_mlp": 1.1014719, "diversity_loss_mlp": 0.0, "epoch": 0.2856868026163909, "flos": 538589025792.0, "grad_norm": 0.06788684976720215, "language_loss": 0.80004096, "learning_rate": 0.0008381458706936123, "loss": 0.81120521, "num_input_tokens_seen": 122729056, "router_z_loss_mlp": 0.14916992, "routerloss_mlp": 0.0, "step": 1485, "time_per_iteration": 2.681067943572998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112387, "balance_loss_mlp": 1.09728312, "diversity_loss_mlp": 0.0, "epoch": 0.2858791843016545, "flos": 583772977152.0, "grad_norm": 0.06920905175587555, "language_loss": 0.8725493, "learning_rate": 0.0008379163124464025, "loss": 0.88367319, "num_input_tokens_seen": 122802832, "router_z_loss_mlp": 0.15075684, "routerloss_mlp": 0.0, "step": 1486, "time_per_iteration": 2.7093162536621094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117865, "balance_loss_mlp": 1.10290396, "diversity_loss_mlp": 0.0, "epoch": 0.286071565986918, "flos": 644812130304.0, "grad_norm": 0.09647963836289664, "language_loss": 0.77093983, "learning_rate": 0.0008376866230084452, "loss": 0.78211844, "num_input_tokens_seen": 122881328, "router_z_loss_mlp": 0.14941406, "routerloss_mlp": 0.0, "step": 1487, "time_per_iteration": 2.8678433895111084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00910546, "balance_loss_mlp": 1.59136748, "diversity_loss_mlp": 0.19592074, "epoch": 0.2862639476721816, "flos": 491361873408.0, "grad_norm": 0.03660624024989628, "language_loss": 0.86046171, "learning_rate": 0.000837456802468914, "loss": 0.86956716, "num_input_tokens_seen": 122949680, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01690142, "step": 1488, "time_per_iteration": 2.602982997894287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102391, "balance_loss_mlp": 1.08787107, "diversity_loss_mlp": 0.0, "epoch": 0.2864563293574452, "flos": 521639170560.0, "grad_norm": 0.0820682475712047, "language_loss": 0.85374725, "learning_rate": 0.0008372268509170331, "loss": 0.86477119, "num_input_tokens_seen": 123024736, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1489, "time_per_iteration": 2.6895487308502197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099554, "balance_loss_mlp": 1.08529639, "diversity_loss_mlp": 0.0, "epoch": 0.2866487110427087, "flos": 547118281728.0, "grad_norm": 0.09305985964981825, "language_loss": 0.85262501, "learning_rate": 0.0008369967684420779, "loss": 0.86362052, "num_input_tokens_seen": 123097344, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1490, "time_per_iteration": 2.7102949619293213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083179, "balance_loss_mlp": 1.06912422, "diversity_loss_mlp": 0.0, "epoch": 0.2868410927279723, "flos": 482224720896.0, "grad_norm": 0.08804420397834639, "language_loss": 0.84696782, "learning_rate": 0.0008367665551333736, "loss": 0.85779965, "num_input_tokens_seen": 123166240, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1491, "time_per_iteration": 2.618272304534912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088636, "balance_loss_mlp": 1.07430756, "diversity_loss_mlp": 0.0, "epoch": 0.28703347441323585, "flos": 724889129472.0, "grad_norm": 0.07991380194683065, "language_loss": 0.85525382, "learning_rate": 0.0008365362110802977, "loss": 0.86614019, "num_input_tokens_seen": 123238160, "router_z_loss_mlp": 0.14343262, "routerloss_mlp": 0.0, "step": 1492, "time_per_iteration": 2.851928234100342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101019, "balance_loss_mlp": 1.08655906, "diversity_loss_mlp": 0.0, "epoch": 0.28722585609849943, "flos": 634978248192.0, "grad_norm": 0.0838988471662801, "language_loss": 0.82620168, "learning_rate": 0.0008363057363722773, "loss": 0.83721185, "num_input_tokens_seen": 123319504, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1493, "time_per_iteration": 2.853207588195801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106374, "balance_loss_mlp": 1.09245062, "diversity_loss_mlp": 0.0, "epoch": 0.28741823778376296, "flos": 510229020672.0, "grad_norm": 0.06826703692619526, "language_loss": 0.84157109, "learning_rate": 0.0008360751310987906, "loss": 0.85263485, "num_input_tokens_seen": 123387008, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1494, "time_per_iteration": 2.57387638092041 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113071, "balance_loss_mlp": 1.11695361, "diversity_loss_mlp": 0.0, "epoch": 0.28761061946902655, "flos": 603752030208.0, "grad_norm": 0.058749130100992836, "language_loss": 0.85290074, "learning_rate": 0.0008358443953493666, "loss": 0.86420786, "num_input_tokens_seen": 123471056, "router_z_loss_mlp": 0.13781738, "routerloss_mlp": 0.0, "step": 1495, "time_per_iteration": 2.8883073329925537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164777, "balance_loss_mlp": 1.15067482, "diversity_loss_mlp": 0.0, "epoch": 0.28780300115429014, "flos": 407193329664.0, "grad_norm": 0.08087911977453179, "language_loss": 0.88221979, "learning_rate": 0.0008356135292135851, "loss": 0.89386749, "num_input_tokens_seen": 123535024, "router_z_loss_mlp": 0.14086914, "routerloss_mlp": 0.0, "step": 1496, "time_per_iteration": 2.5230934619903564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186431, "balance_loss_mlp": 1.17226899, "diversity_loss_mlp": 0.0, "epoch": 0.28799538283955367, "flos": 374929357824.0, "grad_norm": 0.11116302526442519, "language_loss": 0.92429602, "learning_rate": 0.0008353825327810758, "loss": 0.93616039, "num_input_tokens_seen": 123596224, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1497, "time_per_iteration": 2.420966863632202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188369, "balance_loss_mlp": 1.17465985, "diversity_loss_mlp": 0.0, "epoch": 0.28818776452481726, "flos": 591919363584.0, "grad_norm": 0.07094257684914687, "language_loss": 0.8160103, "learning_rate": 0.00083515140614152, "loss": 0.82789397, "num_input_tokens_seen": 123668640, "router_z_loss_mlp": 0.1373291, "routerloss_mlp": 0.0, "step": 1498, "time_per_iteration": 2.7105205059051514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01172297, "balance_loss_mlp": 1.15901685, "diversity_loss_mlp": 0.0, "epoch": 0.2883801462100808, "flos": 535075642368.0, "grad_norm": 0.09212284213685974, "language_loss": 0.87059236, "learning_rate": 0.0008349201493846485, "loss": 0.88231528, "num_input_tokens_seen": 123740816, "router_z_loss_mlp": 0.13293457, "routerloss_mlp": 0.0, "step": 1499, "time_per_iteration": 2.6807801723480225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148211, "balance_loss_mlp": 1.13470435, "diversity_loss_mlp": 0.0, "epoch": 0.2885725278953444, "flos": 480094884864.0, "grad_norm": 0.07375807574735407, "language_loss": 0.88790113, "learning_rate": 0.0008346887626002432, "loss": 0.89938325, "num_input_tokens_seen": 123805968, "router_z_loss_mlp": 0.13525391, "routerloss_mlp": 0.0, "step": 1500, "time_per_iteration": 2.5591442584991455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00919256, "balance_loss_mlp": 1.60489607, "diversity_loss_mlp": 0.19980004, "epoch": 0.2887649095806079, "flos": 464044391424.0, "grad_norm": 0.030907333217789122, "language_loss": 0.85892522, "learning_rate": 0.000834457245878137, "loss": 0.86811781, "num_input_tokens_seen": 123876576, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0169074, "step": 1501, "time_per_iteration": 2.6543540954589844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112198, "balance_loss_mlp": 1.10861671, "diversity_loss_mlp": 0.0, "epoch": 0.2889572912658715, "flos": 931032092160.0, "grad_norm": 0.09029230185558035, "language_loss": 0.81450766, "learning_rate": 0.000834225599308212, "loss": 0.82572746, "num_input_tokens_seen": 123967664, "router_z_loss_mlp": 0.13378906, "routerloss_mlp": 0.0, "step": 1502, "time_per_iteration": 3.2493886947631836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125349, "balance_loss_mlp": 1.11191428, "diversity_loss_mlp": 0.0, "epoch": 0.28914967295113503, "flos": 570129103872.0, "grad_norm": 0.07343077704271528, "language_loss": 0.85592055, "learning_rate": 0.0008339938229804016, "loss": 0.86717403, "num_input_tokens_seen": 124039680, "router_z_loss_mlp": 0.13458252, "routerloss_mlp": 0.0, "step": 1503, "time_per_iteration": 2.712455987930298 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091173, "balance_loss_mlp": 1.08344853, "diversity_loss_mlp": 0.0, "epoch": 0.2893420546363986, "flos": 1486614643200.0, "grad_norm": 0.040592353184382625, "language_loss": 0.75434822, "learning_rate": 0.0008337619169846895, "loss": 0.76525998, "num_input_tokens_seen": 124278848, "router_z_loss_mlp": 0.07714844, "routerloss_mlp": 0.0, "step": 1504, "time_per_iteration": 4.975377082824707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117311, "balance_loss_mlp": 1.10320854, "diversity_loss_mlp": 0.0, "epoch": 0.2895344363216622, "flos": 470186850816.0, "grad_norm": 0.10665663300821891, "language_loss": 0.84014988, "learning_rate": 0.0008335298814111094, "loss": 0.85132295, "num_input_tokens_seen": 124346736, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1505, "time_per_iteration": 2.563352584838867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119478, "balance_loss_mlp": 1.10572124, "diversity_loss_mlp": 0.0, "epoch": 0.28972681800692573, "flos": 648194835456.0, "grad_norm": 0.07488877863745698, "language_loss": 0.87982982, "learning_rate": 0.0008332977163497455, "loss": 0.89102459, "num_input_tokens_seen": 124420816, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1506, "time_per_iteration": 2.799177646636963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011178, "balance_loss_mlp": 1.10419846, "diversity_loss_mlp": 0.0, "epoch": 0.2899191996921893, "flos": 572224435200.0, "grad_norm": 0.08855239932012744, "language_loss": 0.83522987, "learning_rate": 0.0008330654218907325, "loss": 0.84640789, "num_input_tokens_seen": 124490480, "router_z_loss_mlp": 0.13598633, "routerloss_mlp": 0.0, "step": 1507, "time_per_iteration": 2.7311654090881348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130891, "balance_loss_mlp": 1.1170032, "diversity_loss_mlp": 0.0, "epoch": 0.29011158137745285, "flos": 661356721152.0, "grad_norm": 0.06185767339129184, "language_loss": 0.82011658, "learning_rate": 0.0008328329981242548, "loss": 0.83142549, "num_input_tokens_seen": 124564960, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1508, "time_per_iteration": 2.87014102935791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148949, "balance_loss_mlp": 1.13483465, "diversity_loss_mlp": 0.0, "epoch": 0.29030396306271644, "flos": 536226822144.0, "grad_norm": 0.0780337340178098, "language_loss": 0.88045996, "learning_rate": 0.0008326004451405475, "loss": 0.89194947, "num_input_tokens_seen": 124637424, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1509, "time_per_iteration": 2.7449288368225098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146827, "balance_loss_mlp": 1.13290334, "diversity_loss_mlp": 0.0, "epoch": 0.29049634474798, "flos": 511956163584.0, "grad_norm": 0.07615169765943663, "language_loss": 0.82328165, "learning_rate": 0.0008323677630298957, "loss": 0.83474988, "num_input_tokens_seen": 124704832, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1510, "time_per_iteration": 2.5527472496032715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00911058, "balance_loss_mlp": 1.59209251, "diversity_loss_mlp": 0.19929613, "epoch": 0.29068872643324356, "flos": 613758809088.0, "grad_norm": 0.030084219280472915, "language_loss": 0.84789264, "learning_rate": 0.0008321349518826345, "loss": 0.85700321, "num_input_tokens_seen": 124779600, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01536426, "step": 1511, "time_per_iteration": 2.85006046295166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167449, "balance_loss_mlp": 1.15337038, "diversity_loss_mlp": 0.0, "epoch": 0.2908811081185071, "flos": 546424123392.0, "grad_norm": 0.09547204503407083, "language_loss": 0.94614309, "learning_rate": 0.0008319020117891491, "loss": 0.95781755, "num_input_tokens_seen": 124844128, "router_z_loss_mlp": 0.14086914, "routerloss_mlp": 0.0, "step": 1512, "time_per_iteration": 2.619699001312256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150869, "balance_loss_mlp": 1.13603973, "diversity_loss_mlp": 0.0, "epoch": 0.2910734898037707, "flos": 604792355328.0, "grad_norm": 0.0903449194731753, "language_loss": 0.86757064, "learning_rate": 0.0008316689428398751, "loss": 0.87907934, "num_input_tokens_seen": 124915376, "router_z_loss_mlp": 0.14819336, "routerloss_mlp": 0.0, "step": 1513, "time_per_iteration": 2.6975061893463135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122355, "balance_loss_mlp": 1.10804975, "diversity_loss_mlp": 0.0, "epoch": 0.29126587148903427, "flos": 574672900608.0, "grad_norm": 0.05700485295001885, "language_loss": 0.88661957, "learning_rate": 0.0008314357451252979, "loss": 0.89784312, "num_input_tokens_seen": 124995504, "router_z_loss_mlp": 0.14306641, "routerloss_mlp": 0.0, "step": 1514, "time_per_iteration": 2.7759623527526855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101866, "balance_loss_mlp": 1.08762062, "diversity_loss_mlp": 0.0, "epoch": 0.2914582531742978, "flos": 571068112896.0, "grad_norm": 0.06876651723291546, "language_loss": 0.87979865, "learning_rate": 0.0008312024187359527, "loss": 0.89081734, "num_input_tokens_seen": 125064192, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1515, "time_per_iteration": 2.6594746112823486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108928, "balance_loss_mlp": 1.07499838, "diversity_loss_mlp": 0.0, "epoch": 0.2916506348595614, "flos": 730878142464.0, "grad_norm": 0.06943657009436902, "language_loss": 0.87168229, "learning_rate": 0.000830968963762425, "loss": 0.88257504, "num_input_tokens_seen": 125150560, "router_z_loss_mlp": 0.14282227, "routerloss_mlp": 0.0, "step": 1516, "time_per_iteration": 3.0544168949127197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078645, "balance_loss_mlp": 1.06457818, "diversity_loss_mlp": 0.0, "epoch": 0.2918430165448249, "flos": 510468728832.0, "grad_norm": 0.07942748937188983, "language_loss": 0.84183443, "learning_rate": 0.0008307353802953497, "loss": 0.85262084, "num_input_tokens_seen": 125219264, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1517, "time_per_iteration": 2.7325901985168457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075057, "balance_loss_mlp": 1.06031072, "diversity_loss_mlp": 0.0, "epoch": 0.2920353982300885, "flos": 630397375488.0, "grad_norm": 0.0903207444065502, "language_loss": 0.86203992, "learning_rate": 0.0008305016684254125, "loss": 0.87279052, "num_input_tokens_seen": 125301904, "router_z_loss_mlp": 0.1472168, "routerloss_mlp": 0.0, "step": 1518, "time_per_iteration": 2.790580987930298 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073552, "balance_loss_mlp": 1.05908012, "diversity_loss_mlp": 0.0, "epoch": 0.29222777991535204, "flos": 501662688768.0, "grad_norm": 0.07640210633127195, "language_loss": 0.86818451, "learning_rate": 0.0008302678282433479, "loss": 0.87892002, "num_input_tokens_seen": 125367712, "router_z_loss_mlp": 0.14465332, "routerloss_mlp": 0.0, "step": 1519, "time_per_iteration": 2.594045400619507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077986, "balance_loss_mlp": 1.06394291, "diversity_loss_mlp": 0.0, "epoch": 0.2924201616006156, "flos": 486785769984.0, "grad_norm": 0.07607218771192015, "language_loss": 0.84937745, "learning_rate": 0.0008300338598399411, "loss": 0.86015737, "num_input_tokens_seen": 125437648, "router_z_loss_mlp": 0.14050293, "routerloss_mlp": 0.0, "step": 1520, "time_per_iteration": 2.6176183223724365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00897129, "balance_loss_mlp": 1.56367016, "diversity_loss_mlp": 0.19839743, "epoch": 0.2926125432858792, "flos": 476450449920.0, "grad_norm": 0.03454500929264816, "language_loss": 0.94754219, "learning_rate": 0.0008297997633060263, "loss": 0.95651346, "num_input_tokens_seen": 125502432, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0160955, "step": 1521, "time_per_iteration": 2.5507402420043945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098471, "balance_loss_mlp": 1.08445215, "diversity_loss_mlp": 0.0, "epoch": 0.29280492497114274, "flos": 676675980288.0, "grad_norm": 0.07923859397995789, "language_loss": 0.84868819, "learning_rate": 0.0008295655387324883, "loss": 0.8596729, "num_input_tokens_seen": 125575424, "router_z_loss_mlp": 0.14038086, "routerloss_mlp": 0.0, "step": 1522, "time_per_iteration": 2.942894458770752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103286, "balance_loss_mlp": 1.08957708, "diversity_loss_mlp": 0.0, "epoch": 0.29299730665640633, "flos": 458408512512.0, "grad_norm": 0.09185291067452052, "language_loss": 0.84979212, "learning_rate": 0.0008293311862102609, "loss": 0.86082506, "num_input_tokens_seen": 125639040, "router_z_loss_mlp": 0.13708496, "routerloss_mlp": 0.0, "step": 1523, "time_per_iteration": 2.555556297302246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115952, "balance_loss_mlp": 1.10218382, "diversity_loss_mlp": 0.0, "epoch": 0.29318968834166986, "flos": 446573274624.0, "grad_norm": 0.07878242279946136, "language_loss": 0.88546365, "learning_rate": 0.0008290967058303275, "loss": 0.89662319, "num_input_tokens_seen": 125701712, "router_z_loss_mlp": 0.13781738, "routerloss_mlp": 0.0, "step": 1524, "time_per_iteration": 2.5723721981048584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117022, "balance_loss_mlp": 1.10387325, "diversity_loss_mlp": 0.0, "epoch": 0.29338207002693345, "flos": 450319025664.0, "grad_norm": 0.07157234250277994, "language_loss": 0.86573815, "learning_rate": 0.0008288620976837219, "loss": 0.87690842, "num_input_tokens_seen": 125765088, "router_z_loss_mlp": 0.13171387, "routerloss_mlp": 0.0, "step": 1525, "time_per_iteration": 2.539079427719116 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116802, "balance_loss_mlp": 1.10354626, "diversity_loss_mlp": 0.0, "epoch": 0.293574451712197, "flos": 502277925888.0, "grad_norm": 0.07300174969402286, "language_loss": 0.82548958, "learning_rate": 0.000828627361861527, "loss": 0.83665758, "num_input_tokens_seen": 125831328, "router_z_loss_mlp": 0.1328125, "routerloss_mlp": 0.0, "step": 1526, "time_per_iteration": 2.5784413814544678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117225, "balance_loss_mlp": 1.10368335, "diversity_loss_mlp": 0.0, "epoch": 0.29376683339746057, "flos": 696462312960.0, "grad_norm": 0.105387273671708, "language_loss": 0.84438479, "learning_rate": 0.0008283924984548752, "loss": 0.85555708, "num_input_tokens_seen": 125903664, "router_z_loss_mlp": 0.13549805, "routerloss_mlp": 0.0, "step": 1527, "time_per_iteration": 2.876854181289673 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136626, "balance_loss_mlp": 1.12352467, "diversity_loss_mlp": 0.0, "epoch": 0.2939592150827241, "flos": 478590197760.0, "grad_norm": 0.07473419184062492, "language_loss": 0.84776825, "learning_rate": 0.0008281575075549485, "loss": 0.8591345, "num_input_tokens_seen": 125971856, "router_z_loss_mlp": 0.13110352, "routerloss_mlp": 0.0, "step": 1528, "time_per_iteration": 2.5660881996154785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103997, "balance_loss_mlp": 1.09631968, "diversity_loss_mlp": 0.0, "epoch": 0.2941515967679877, "flos": 1485260831232.0, "grad_norm": 0.053938657910520806, "language_loss": 0.77352691, "learning_rate": 0.000827922389252979, "loss": 0.78456688, "num_input_tokens_seen": 126183968, "router_z_loss_mlp": 0.07666016, "routerloss_mlp": 0.0, "step": 1529, "time_per_iteration": 4.633493423461914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149647, "balance_loss_mlp": 1.13666511, "diversity_loss_mlp": 0.0, "epoch": 0.2943439784532513, "flos": 674158132224.0, "grad_norm": 0.07225715112962865, "language_loss": 0.90511358, "learning_rate": 0.0008276871436402469, "loss": 0.91661, "num_input_tokens_seen": 126254448, "router_z_loss_mlp": 0.13000488, "routerloss_mlp": 0.0, "step": 1530, "time_per_iteration": 2.8149213790893555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01156897, "balance_loss_mlp": 1.14402199, "diversity_loss_mlp": 0.0, "epoch": 0.2945363601385148, "flos": 576301298688.0, "grad_norm": 0.10076437192912456, "language_loss": 0.87526608, "learning_rate": 0.000827451770808083, "loss": 0.88683504, "num_input_tokens_seen": 126328208, "router_z_loss_mlp": 0.12890625, "routerloss_mlp": 0.0, "step": 1531, "time_per_iteration": 2.7307019233703613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137224, "balance_loss_mlp": 1.12402749, "diversity_loss_mlp": 0.0, "epoch": 0.2947287418237784, "flos": 480655793664.0, "grad_norm": 0.07118672956881426, "language_loss": 0.8318634, "learning_rate": 0.0008272162708478674, "loss": 0.84323561, "num_input_tokens_seen": 126396464, "router_z_loss_mlp": 0.13220215, "routerloss_mlp": 0.0, "step": 1532, "time_per_iteration": 2.559326648712158 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135091, "balance_loss_mlp": 1.1222167, "diversity_loss_mlp": 0.0, "epoch": 0.2949211235090419, "flos": 558185209344.0, "grad_norm": 0.07324079883183283, "language_loss": 0.86170006, "learning_rate": 0.000826980643851029, "loss": 0.87305093, "num_input_tokens_seen": 126468960, "router_z_loss_mlp": 0.12890625, "routerloss_mlp": 0.0, "step": 1533, "time_per_iteration": 2.728351354598999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120692, "balance_loss_mlp": 1.10734081, "diversity_loss_mlp": 0.0, "epoch": 0.2951135051943055, "flos": 483887623680.0, "grad_norm": 0.07850912920042735, "language_loss": 0.84523225, "learning_rate": 0.0008267448899090464, "loss": 0.85643911, "num_input_tokens_seen": 126536496, "router_z_loss_mlp": 0.13378906, "routerloss_mlp": 0.0, "step": 1534, "time_per_iteration": 2.595296859741211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121931, "balance_loss_mlp": 1.10788798, "diversity_loss_mlp": 0.0, "epoch": 0.29530588687956905, "flos": 550295783424.0, "grad_norm": 0.07265790711823701, "language_loss": 0.80930066, "learning_rate": 0.0008265090091134473, "loss": 0.82051992, "num_input_tokens_seen": 126614048, "router_z_loss_mlp": 0.14038086, "routerloss_mlp": 0.0, "step": 1535, "time_per_iteration": 2.8336315155029297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105358, "balance_loss_mlp": 1.09133863, "diversity_loss_mlp": 0.0, "epoch": 0.29549826856483263, "flos": 673046226432.0, "grad_norm": 0.08467148330579209, "language_loss": 0.80271345, "learning_rate": 0.0008262730015558088, "loss": 0.81376696, "num_input_tokens_seen": 126697248, "router_z_loss_mlp": 0.14025879, "routerloss_mlp": 0.0, "step": 1536, "time_per_iteration": 2.9066760540008545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102197, "balance_loss_mlp": 1.08847594, "diversity_loss_mlp": 0.0, "epoch": 0.29569065025009617, "flos": 764666625024.0, "grad_norm": 0.07407642769484, "language_loss": 0.81805962, "learning_rate": 0.0008260368673277574, "loss": 0.82908159, "num_input_tokens_seen": 126782496, "router_z_loss_mlp": 0.1373291, "routerloss_mlp": 0.0, "step": 1537, "time_per_iteration": 3.1795482635498047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106736, "balance_loss_mlp": 1.09302735, "diversity_loss_mlp": 0.0, "epoch": 0.29588303193535975, "flos": 543683819520.0, "grad_norm": 0.06784415515848828, "language_loss": 0.84026253, "learning_rate": 0.0008258006065209682, "loss": 0.85132986, "num_input_tokens_seen": 126857328, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1538, "time_per_iteration": 2.766732931137085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112658, "balance_loss_mlp": 1.09863889, "diversity_loss_mlp": 0.0, "epoch": 0.29607541362062334, "flos": 596947345920.0, "grad_norm": 0.0747520981493109, "language_loss": 0.80543184, "learning_rate": 0.0008255642192271657, "loss": 0.81655836, "num_input_tokens_seen": 126932608, "router_z_loss_mlp": 0.14038086, "routerloss_mlp": 0.0, "step": 1539, "time_per_iteration": 2.792191505432129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130833, "balance_loss_mlp": 1.11683834, "diversity_loss_mlp": 0.0, "epoch": 0.29626779530588687, "flos": 609877237248.0, "grad_norm": 0.06277821647748005, "language_loss": 0.83592129, "learning_rate": 0.0008253277055381241, "loss": 0.8472296, "num_input_tokens_seen": 127008928, "router_z_loss_mlp": 0.14013672, "routerloss_mlp": 0.0, "step": 1540, "time_per_iteration": 2.8384311199188232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138407, "balance_loss_mlp": 1.12428069, "diversity_loss_mlp": 0.0, "epoch": 0.29646017699115046, "flos": 867430674432.0, "grad_norm": 0.09924754491110549, "language_loss": 0.85482454, "learning_rate": 0.0008250910655456658, "loss": 0.86620867, "num_input_tokens_seen": 127097104, "router_z_loss_mlp": 0.14147949, "routerloss_mlp": 0.0, "step": 1541, "time_per_iteration": 3.1718008518218994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133859, "balance_loss_mlp": 1.12016189, "diversity_loss_mlp": 0.0, "epoch": 0.296652558676414, "flos": 495868594176.0, "grad_norm": 0.07747440640117766, "language_loss": 0.83370835, "learning_rate": 0.0008248542993416625, "loss": 0.84504688, "num_input_tokens_seen": 127165264, "router_z_loss_mlp": 0.13708496, "routerloss_mlp": 0.0, "step": 1542, "time_per_iteration": 2.5952396392822266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127147, "balance_loss_mlp": 1.11278272, "diversity_loss_mlp": 0.0, "epoch": 0.2968449403616776, "flos": 571544957952.0, "grad_norm": 0.08018137719350796, "language_loss": 0.83926904, "learning_rate": 0.0008246174070180352, "loss": 0.85054052, "num_input_tokens_seen": 127238992, "router_z_loss_mlp": 0.14355469, "routerloss_mlp": 0.0, "step": 1543, "time_per_iteration": 2.6775217056274414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115598, "balance_loss_mlp": 1.10168624, "diversity_loss_mlp": 0.0, "epoch": 0.2970373220469411, "flos": 794168271360.0, "grad_norm": 0.09273281815149376, "language_loss": 0.83928716, "learning_rate": 0.0008243803886667537, "loss": 0.85044312, "num_input_tokens_seen": 127328160, "router_z_loss_mlp": 0.13916016, "routerloss_mlp": 0.0, "step": 1544, "time_per_iteration": 3.0925238132476807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110422, "balance_loss_mlp": 1.09024858, "diversity_loss_mlp": 0.0, "epoch": 0.2972297037322047, "flos": 661038091776.0, "grad_norm": 0.06593992881851045, "language_loss": 0.79115343, "learning_rate": 0.0008241432443798364, "loss": 0.80219567, "num_input_tokens_seen": 127407328, "router_z_loss_mlp": 0.13989258, "routerloss_mlp": 0.0, "step": 1545, "time_per_iteration": 2.839099407196045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088557, "balance_loss_mlp": 1.07518196, "diversity_loss_mlp": 0.0, "epoch": 0.29742208541746823, "flos": 597125385216.0, "grad_norm": 0.05453506209022983, "language_loss": 0.85691601, "learning_rate": 0.0008239059742493512, "loss": 0.86780155, "num_input_tokens_seen": 127477136, "router_z_loss_mlp": 0.1340332, "routerloss_mlp": 0.0, "step": 1546, "time_per_iteration": 2.7476751804351807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088565, "balance_loss_mlp": 1.07480812, "diversity_loss_mlp": 0.0, "epoch": 0.2976144671027318, "flos": 769882558464.0, "grad_norm": 0.06672989003234615, "language_loss": 0.87117672, "learning_rate": 0.0008236685783674142, "loss": 0.88206244, "num_input_tokens_seen": 127565680, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1547, "time_per_iteration": 3.0519776344299316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107831, "balance_loss_mlp": 1.06796312, "diversity_loss_mlp": 0.0, "epoch": 0.2978068487879954, "flos": 1484764162560.0, "grad_norm": 0.04305360715769565, "language_loss": 0.76221192, "learning_rate": 0.0008234310568261911, "loss": 0.772995, "num_input_tokens_seen": 127791584, "router_z_loss_mlp": 0.10351562, "routerloss_mlp": 0.0, "step": 1548, "time_per_iteration": 4.883166790008545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084727, "balance_loss_mlp": 1.07123256, "diversity_loss_mlp": 0.0, "epoch": 0.29799923047325894, "flos": 475328632320.0, "grad_norm": 0.11160876507978217, "language_loss": 0.82253683, "learning_rate": 0.0008231934097178955, "loss": 0.8333841, "num_input_tokens_seen": 127860112, "router_z_loss_mlp": 0.1350708, "routerloss_mlp": 0.0, "step": 1549, "time_per_iteration": 2.60786771774292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092454, "balance_loss_mlp": 1.07919788, "diversity_loss_mlp": 0.0, "epoch": 0.2981916121585225, "flos": 759804198912.0, "grad_norm": 0.07843428838445873, "language_loss": 0.85328496, "learning_rate": 0.0008229556371347903, "loss": 0.86420953, "num_input_tokens_seen": 127938752, "router_z_loss_mlp": 0.1328125, "routerloss_mlp": 0.0, "step": 1550, "time_per_iteration": 2.962412118911743 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106892, "balance_loss_mlp": 1.09379029, "diversity_loss_mlp": 0.0, "epoch": 0.29838399384378606, "flos": 875016152064.0, "grad_norm": 0.0840525031564576, "language_loss": 0.79399186, "learning_rate": 0.0008227177391691874, "loss": 0.80506086, "num_input_tokens_seen": 128022192, "router_z_loss_mlp": 0.13122559, "routerloss_mlp": 0.0, "step": 1551, "time_per_iteration": 3.1673550605773926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111848, "balance_loss_mlp": 1.09871709, "diversity_loss_mlp": 0.0, "epoch": 0.29857637552904964, "flos": 579661608960.0, "grad_norm": 0.07195743014481873, "language_loss": 0.89281148, "learning_rate": 0.0008224797159134463, "loss": 0.90392995, "num_input_tokens_seen": 128097776, "router_z_loss_mlp": 0.13146973, "routerloss_mlp": 0.0, "step": 1552, "time_per_iteration": 2.7333877086639404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121508, "balance_loss_mlp": 1.10890126, "diversity_loss_mlp": 0.0, "epoch": 0.2987687572143132, "flos": 836399748096.0, "grad_norm": 0.07485820549569244, "language_loss": 0.83144093, "learning_rate": 0.0008222415674599765, "loss": 0.84265602, "num_input_tokens_seen": 128179888, "router_z_loss_mlp": 0.12609863, "routerloss_mlp": 0.0, "step": 1553, "time_per_iteration": 3.077017068862915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135128, "balance_loss_mlp": 1.12165701, "diversity_loss_mlp": 0.0, "epoch": 0.29896113889957676, "flos": 567072741888.0, "grad_norm": 0.08671551895934956, "language_loss": 0.83149582, "learning_rate": 0.0008220032939012349, "loss": 0.84284711, "num_input_tokens_seen": 128251152, "router_z_loss_mlp": 0.13500977, "routerloss_mlp": 0.0, "step": 1554, "time_per_iteration": 2.6689035892486572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115966, "balance_loss_mlp": 1.10284674, "diversity_loss_mlp": 0.0, "epoch": 0.29915352058484035, "flos": 498662853120.0, "grad_norm": 0.06666483036401037, "language_loss": 0.87800217, "learning_rate": 0.0008217648953297277, "loss": 0.88916183, "num_input_tokens_seen": 128327600, "router_z_loss_mlp": 0.13128662, "routerloss_mlp": 0.0, "step": 1555, "time_per_iteration": 2.8417294025421143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119981, "balance_loss_mlp": 1.10677278, "diversity_loss_mlp": 0.0, "epoch": 0.2993459022701039, "flos": 592112083968.0, "grad_norm": 0.08472740856632217, "language_loss": 0.78017807, "learning_rate": 0.0008215263718380095, "loss": 0.7913779, "num_input_tokens_seen": 128398432, "router_z_loss_mlp": 0.13220215, "routerloss_mlp": 0.0, "step": 1556, "time_per_iteration": 2.682047128677368 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096383, "balance_loss_mlp": 1.08319807, "diversity_loss_mlp": 0.0, "epoch": 0.29953828395536747, "flos": 572380079616.0, "grad_norm": 0.07743195715790333, "language_loss": 0.84389544, "learning_rate": 0.0008212877235186833, "loss": 0.85485923, "num_input_tokens_seen": 128469696, "router_z_loss_mlp": 0.13201904, "routerloss_mlp": 0.0, "step": 1557, "time_per_iteration": 2.6532580852508545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074398, "balance_loss_mlp": 1.06710196, "diversity_loss_mlp": 0.0, "epoch": 0.299730665640631, "flos": 1504698425856.0, "grad_norm": 0.04061005434024277, "language_loss": 0.77737558, "learning_rate": 0.0008210489504644005, "loss": 0.78811955, "num_input_tokens_seen": 128698560, "router_z_loss_mlp": 0.07275391, "routerloss_mlp": 0.0, "step": 1558, "time_per_iteration": 4.923272132873535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092088, "balance_loss_mlp": 1.07896352, "diversity_loss_mlp": 0.0, "epoch": 0.2999230473258946, "flos": 513791963136.0, "grad_norm": 0.10565427097675566, "language_loss": 0.8116585, "learning_rate": 0.0008208100527678611, "loss": 0.82257938, "num_input_tokens_seen": 128765952, "router_z_loss_mlp": 0.13146973, "routerloss_mlp": 0.0, "step": 1559, "time_per_iteration": 2.602773427963257 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084055, "balance_loss_mlp": 1.07101393, "diversity_loss_mlp": 0.0, "epoch": 0.3001154290111581, "flos": 834472544256.0, "grad_norm": 0.11780548804152448, "language_loss": 0.78494406, "learning_rate": 0.0008205710305218135, "loss": 0.79578459, "num_input_tokens_seen": 128840048, "router_z_loss_mlp": 0.13061523, "routerloss_mlp": 0.0, "step": 1560, "time_per_iteration": 3.013576030731201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089526, "balance_loss_mlp": 1.07663918, "diversity_loss_mlp": 0.0, "epoch": 0.3003078106964217, "flos": 556776695808.0, "grad_norm": 0.08018423106971302, "language_loss": 0.89838511, "learning_rate": 0.0008203318838190541, "loss": 0.9092803, "num_input_tokens_seen": 128912496, "router_z_loss_mlp": 0.12890625, "routerloss_mlp": 0.0, "step": 1561, "time_per_iteration": 2.741619348526001 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108989, "balance_loss_mlp": 1.07702184, "diversity_loss_mlp": 0.0, "epoch": 0.30050019238168524, "flos": 526151033856.0, "grad_norm": 0.09397123990600864, "language_loss": 0.85396177, "learning_rate": 0.0008200926127524281, "loss": 0.86486065, "num_input_tokens_seen": 128980624, "router_z_loss_mlp": 0.12884521, "routerloss_mlp": 0.0, "step": 1562, "time_per_iteration": 2.60974383354187 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106708, "balance_loss_mlp": 1.0936904, "diversity_loss_mlp": 0.0, "epoch": 0.3006925740669488, "flos": 577852973568.0, "grad_norm": 0.08688269643752358, "language_loss": 0.83400619, "learning_rate": 0.0008198532174148289, "loss": 0.84507322, "num_input_tokens_seen": 129050576, "router_z_loss_mlp": 0.13031006, "routerloss_mlp": 0.0, "step": 1563, "time_per_iteration": 2.7336533069610596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079297, "balance_loss_mlp": 1.07195389, "diversity_loss_mlp": 0.0, "epoch": 0.3008849557522124, "flos": 1490246595072.0, "grad_norm": 0.04112604139988501, "language_loss": 0.8068617, "learning_rate": 0.0008196136978991977, "loss": 0.81765467, "num_input_tokens_seen": 129278880, "router_z_loss_mlp": 0.07324219, "routerloss_mlp": 0.0, "step": 1564, "time_per_iteration": 4.828714609146118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145426, "balance_loss_mlp": 1.1324501, "diversity_loss_mlp": 0.0, "epoch": 0.30107733743747594, "flos": 509816415744.0, "grad_norm": 0.08852118135813189, "language_loss": 0.89291, "learning_rate": 0.0008193740542985244, "loss": 0.90436429, "num_input_tokens_seen": 129346560, "router_z_loss_mlp": 0.12988281, "routerloss_mlp": 0.0, "step": 1565, "time_per_iteration": 2.5988731384277344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151488, "balance_loss_mlp": 1.13872099, "diversity_loss_mlp": 0.0, "epoch": 0.30126971912273953, "flos": 587704108032.0, "grad_norm": 0.1281977179548432, "language_loss": 0.86354733, "learning_rate": 0.0008191342867058467, "loss": 0.87506223, "num_input_tokens_seen": 129420448, "router_z_loss_mlp": 0.12780762, "routerloss_mlp": 0.0, "step": 1566, "time_per_iteration": 2.6914639472961426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118723, "balance_loss_mlp": 1.10574174, "diversity_loss_mlp": 0.0, "epoch": 0.30146210080800306, "flos": 602101610496.0, "grad_norm": 0.07018370282969584, "language_loss": 0.83602738, "learning_rate": 0.0008188943952142509, "loss": 0.84721458, "num_input_tokens_seen": 129494032, "router_z_loss_mlp": 0.13000488, "routerloss_mlp": 0.0, "step": 1567, "time_per_iteration": 2.7846438884735107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111402, "balance_loss_mlp": 1.09847367, "diversity_loss_mlp": 0.0, "epoch": 0.30165448249326665, "flos": 917796054528.0, "grad_norm": 0.08750889372003143, "language_loss": 0.82150149, "learning_rate": 0.0008186543799168711, "loss": 0.83261549, "num_input_tokens_seen": 129569088, "router_z_loss_mlp": 0.12945557, "routerloss_mlp": 0.0, "step": 1568, "time_per_iteration": 3.1300384998321533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094152, "balance_loss_mlp": 1.08103871, "diversity_loss_mlp": 0.0, "epoch": 0.3018468641785302, "flos": 777287798784.0, "grad_norm": 0.07719475001811499, "language_loss": 0.88627326, "learning_rate": 0.0008184142409068892, "loss": 0.89721477, "num_input_tokens_seen": 129647968, "router_z_loss_mlp": 0.13134766, "routerloss_mlp": 0.0, "step": 1569, "time_per_iteration": 2.9922726154327393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087737, "balance_loss_mlp": 1.07475495, "diversity_loss_mlp": 0.0, "epoch": 0.30203924586379377, "flos": 522358295040.0, "grad_norm": 0.07345065764158631, "language_loss": 0.86446834, "learning_rate": 0.000818173978277536, "loss": 0.87534571, "num_input_tokens_seen": 129718928, "router_z_loss_mlp": 0.12994385, "routerloss_mlp": 0.0, "step": 1570, "time_per_iteration": 2.695930242538452 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089564, "balance_loss_mlp": 1.07673669, "diversity_loss_mlp": 0.0, "epoch": 0.3022316275490573, "flos": 524559711744.0, "grad_norm": 0.0712021049255776, "language_loss": 0.83337176, "learning_rate": 0.000817933592122089, "loss": 0.84426749, "num_input_tokens_seen": 129790128, "router_z_loss_mlp": 0.12841797, "routerloss_mlp": 0.0, "step": 1571, "time_per_iteration": 2.7131617069244385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087482, "balance_loss_mlp": 1.07427394, "diversity_loss_mlp": 0.0, "epoch": 0.3024240092343209, "flos": 479912076288.0, "grad_norm": 0.08283074842036095, "language_loss": 0.83667982, "learning_rate": 0.0008176930825338749, "loss": 0.84755468, "num_input_tokens_seen": 129857536, "router_z_loss_mlp": 0.13232422, "routerloss_mlp": 0.0, "step": 1572, "time_per_iteration": 2.5447826385498047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087862, "balance_loss_mlp": 1.07405734, "diversity_loss_mlp": 0.0, "epoch": 0.3026163909195845, "flos": 687206592000.0, "grad_norm": 0.07741282152017008, "language_loss": 0.88849854, "learning_rate": 0.0008174524496062679, "loss": 0.89937723, "num_input_tokens_seen": 129931440, "router_z_loss_mlp": 0.13818359, "routerloss_mlp": 0.0, "step": 1573, "time_per_iteration": 2.908740997314453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092114, "balance_loss_mlp": 1.07822633, "diversity_loss_mlp": 0.0, "epoch": 0.302808772604848, "flos": 542940102144.0, "grad_norm": 0.06962859876416791, "language_loss": 0.85499102, "learning_rate": 0.0008172116934326894, "loss": 0.86591208, "num_input_tokens_seen": 130005200, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1574, "time_per_iteration": 2.751488208770752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098046, "balance_loss_mlp": 1.08365786, "diversity_loss_mlp": 0.0, "epoch": 0.3030011542901116, "flos": 475091495424.0, "grad_norm": 0.09195920466248479, "language_loss": 0.8794626, "learning_rate": 0.0008169708141066097, "loss": 0.89044309, "num_input_tokens_seen": 130069136, "router_z_loss_mlp": 0.1439209, "routerloss_mlp": 0.0, "step": 1575, "time_per_iteration": 2.5947275161743164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118908, "balance_loss_mlp": 1.10441208, "diversity_loss_mlp": 0.0, "epoch": 0.30319353597537513, "flos": 481481003520.0, "grad_norm": 0.0784824693742563, "language_loss": 0.90658617, "learning_rate": 0.0008167298117215465, "loss": 0.91777527, "num_input_tokens_seen": 130135456, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1576, "time_per_iteration": 2.5396125316619873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011316, "balance_loss_mlp": 1.11705649, "diversity_loss_mlp": 0.0, "epoch": 0.3033859176606387, "flos": 704786365440.0, "grad_norm": 0.1093253517132677, "language_loss": 0.87566864, "learning_rate": 0.0008164886863710649, "loss": 0.88698471, "num_input_tokens_seen": 130213712, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1577, "time_per_iteration": 2.931835412979126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138004, "balance_loss_mlp": 1.12323439, "diversity_loss_mlp": 0.0, "epoch": 0.30357829934590225, "flos": 764696360448.0, "grad_norm": 0.07788016425512684, "language_loss": 0.8637675, "learning_rate": 0.0008162474381487783, "loss": 0.87514758, "num_input_tokens_seen": 130290928, "router_z_loss_mlp": 0.14770508, "routerloss_mlp": 0.0, "step": 1578, "time_per_iteration": 3.041262626647949 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125978, "balance_loss_mlp": 1.11132693, "diversity_loss_mlp": 0.0, "epoch": 0.30377068103116583, "flos": 532355162112.0, "grad_norm": 0.1532642042193693, "language_loss": 0.84568751, "learning_rate": 0.0008160060671483475, "loss": 0.8569473, "num_input_tokens_seen": 130362672, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1579, "time_per_iteration": 2.6566197872161865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110829, "balance_loss_mlp": 1.0942831, "diversity_loss_mlp": 0.0, "epoch": 0.3039630627164294, "flos": 510191944704.0, "grad_norm": 0.10001869607158981, "language_loss": 0.8342396, "learning_rate": 0.0008157645734634809, "loss": 0.84532249, "num_input_tokens_seen": 130428848, "router_z_loss_mlp": 0.14013672, "routerloss_mlp": 0.0, "step": 1580, "time_per_iteration": 2.5994346141815186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151521, "balance_loss_mlp": 1.14064956, "diversity_loss_mlp": 0.0, "epoch": 0.30415544440169295, "flos": 1506000854016.0, "grad_norm": 0.06737085519591758, "language_loss": 0.76896489, "learning_rate": 0.000815522957187935, "loss": 0.78048015, "num_input_tokens_seen": 130665440, "router_z_loss_mlp": 0.10888672, "routerloss_mlp": 0.0, "step": 1581, "time_per_iteration": 4.946556329727173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00631723, "balance_loss_mlp": 1.05820811, "diversity_loss_mlp": 0.17941347, "epoch": 0.30434782608695654, "flos": 1458736625664.0, "grad_norm": 0.002006006723137456, "language_loss": 0.73214495, "learning_rate": 0.0008152812184155132, "loss": 0.73846221, "num_input_tokens_seen": 130895248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01291206, "step": 1582, "time_per_iteration": 4.897693395614624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097544, "balance_loss_mlp": 1.08376384, "diversity_loss_mlp": 0.0, "epoch": 0.3045402077722201, "flos": 482555833344.0, "grad_norm": 0.07529557219412701, "language_loss": 0.83949858, "learning_rate": 0.000815039357240067, "loss": 0.85047406, "num_input_tokens_seen": 130964544, "router_z_loss_mlp": 0.13793945, "routerloss_mlp": 0.0, "step": 1583, "time_per_iteration": 2.6096932888031006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101837, "balance_loss_mlp": 1.0882473, "diversity_loss_mlp": 0.0, "epoch": 0.30473258945748366, "flos": 543501010944.0, "grad_norm": 0.0740498467066553, "language_loss": 0.84922493, "learning_rate": 0.0008147973737554952, "loss": 0.86024332, "num_input_tokens_seen": 131041744, "router_z_loss_mlp": 0.13592529, "routerloss_mlp": 0.0, "step": 1584, "time_per_iteration": 2.7863824367523193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106775, "balance_loss_mlp": 1.09364963, "diversity_loss_mlp": 0.0, "epoch": 0.3049249711427472, "flos": 567055489536.0, "grad_norm": 0.11669723774220289, "language_loss": 0.85926318, "learning_rate": 0.000814555268055744, "loss": 0.87033093, "num_input_tokens_seen": 131108864, "router_z_loss_mlp": 0.13146973, "routerloss_mlp": 0.0, "step": 1585, "time_per_iteration": 2.6167564392089844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111589, "balance_loss_mlp": 1.1022768, "diversity_loss_mlp": 0.0, "epoch": 0.3051173528280108, "flos": 528233882112.0, "grad_norm": 0.07476018488685929, "language_loss": 0.87489879, "learning_rate": 0.0008143130402348073, "loss": 0.88605773, "num_input_tokens_seen": 131181104, "router_z_loss_mlp": 0.13647461, "routerloss_mlp": 0.0, "step": 1586, "time_per_iteration": 2.6318202018737793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112097, "balance_loss_mlp": 1.10742807, "diversity_loss_mlp": 0.0, "epoch": 0.3053097345132743, "flos": 586396910592.0, "grad_norm": 0.07016471467090964, "language_loss": 0.79198885, "learning_rate": 0.0008140706903867265, "loss": 0.80319858, "num_input_tokens_seen": 131258704, "router_z_loss_mlp": 0.13562012, "routerloss_mlp": 0.0, "step": 1587, "time_per_iteration": 2.82663893699646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128991, "balance_loss_mlp": 1.11541307, "diversity_loss_mlp": 0.0, "epoch": 0.3055021161985379, "flos": 607087747584.0, "grad_norm": 0.09040046070353, "language_loss": 0.90612531, "learning_rate": 0.0008138282186055897, "loss": 0.91741514, "num_input_tokens_seen": 131325712, "router_z_loss_mlp": 0.13586426, "routerloss_mlp": 0.0, "step": 1588, "time_per_iteration": 2.690561294555664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142156, "balance_loss_mlp": 1.12872136, "diversity_loss_mlp": 0.0, "epoch": 0.3056944978838015, "flos": 573867514368.0, "grad_norm": 0.07675542780120453, "language_loss": 0.82382154, "learning_rate": 0.0008135856249855331, "loss": 0.83524311, "num_input_tokens_seen": 131397568, "router_z_loss_mlp": 0.13464355, "routerloss_mlp": 0.0, "step": 1589, "time_per_iteration": 2.6935813426971436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115907, "balance_loss_mlp": 1.14551568, "diversity_loss_mlp": 0.0, "epoch": 0.305886879569065, "flos": 633925813248.0, "grad_norm": 0.07642745969896261, "language_loss": 0.89603746, "learning_rate": 0.0008133429096207398, "loss": 0.90762818, "num_input_tokens_seen": 131467632, "router_z_loss_mlp": 0.13574219, "routerloss_mlp": 0.0, "step": 1590, "time_per_iteration": 2.7690787315368652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113549, "balance_loss_mlp": 1.10534787, "diversity_loss_mlp": 0.0, "epoch": 0.3060792612543286, "flos": 1369005981696.0, "grad_norm": 0.03962763613217991, "language_loss": 0.75312257, "learning_rate": 0.0008131000726054403, "loss": 0.76425815, "num_input_tokens_seen": 131702224, "router_z_loss_mlp": 0.08203125, "routerloss_mlp": 0.0, "step": 1591, "time_per_iteration": 4.950432538986206 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01184059, "balance_loss_mlp": 1.17060041, "diversity_loss_mlp": 0.0, "epoch": 0.30627164293959214, "flos": 518555644416.0, "grad_norm": 0.0624915030883944, "language_loss": 0.8671608, "learning_rate": 0.0008128571140339123, "loss": 0.87900144, "num_input_tokens_seen": 131774608, "router_z_loss_mlp": 0.13476562, "routerloss_mlp": 0.0, "step": 1592, "time_per_iteration": 2.717022657394409 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01169875, "balance_loss_mlp": 1.15618944, "diversity_loss_mlp": 0.0, "epoch": 0.3064640246248557, "flos": 455589287424.0, "grad_norm": 0.08640912687422367, "language_loss": 0.87240267, "learning_rate": 0.0008126140340004805, "loss": 0.88410139, "num_input_tokens_seen": 131841216, "router_z_loss_mlp": 0.13696289, "routerloss_mlp": 0.0, "step": 1593, "time_per_iteration": 2.5112054347991943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157381, "balance_loss_mlp": 1.14379096, "diversity_loss_mlp": 0.0, "epoch": 0.30665640631011926, "flos": 850095378432.0, "grad_norm": 0.06492228459438584, "language_loss": 0.82168889, "learning_rate": 0.0008123708325995172, "loss": 0.83326268, "num_input_tokens_seen": 131937584, "router_z_loss_mlp": 0.1361084, "routerloss_mlp": 0.0, "step": 1594, "time_per_iteration": 3.193125009536743 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139509, "balance_loss_mlp": 1.1256932, "diversity_loss_mlp": 0.0, "epoch": 0.30684878799538284, "flos": 758319335424.0, "grad_norm": 0.06515151231920442, "language_loss": 0.79815221, "learning_rate": 0.0008121275099254414, "loss": 0.80954736, "num_input_tokens_seen": 132012656, "router_z_loss_mlp": 0.13830566, "routerloss_mlp": 0.0, "step": 1595, "time_per_iteration": 2.9032304286956787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133663, "balance_loss_mlp": 1.12007284, "diversity_loss_mlp": 0.0, "epoch": 0.3070411696806464, "flos": 517574790144.0, "grad_norm": 0.06899315915000012, "language_loss": 0.88638222, "learning_rate": 0.0008118840660727194, "loss": 0.89771879, "num_input_tokens_seen": 132083728, "router_z_loss_mlp": 0.13592529, "routerloss_mlp": 0.0, "step": 1596, "time_per_iteration": 2.6298515796661377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115466, "balance_loss_mlp": 1.10215056, "diversity_loss_mlp": 0.0, "epoch": 0.30723355136590996, "flos": 844264207872.0, "grad_norm": 0.06984166924665287, "language_loss": 0.87847084, "learning_rate": 0.0008116405011358644, "loss": 0.88962543, "num_input_tokens_seen": 132170896, "router_z_loss_mlp": 0.13336182, "routerloss_mlp": 0.0, "step": 1597, "time_per_iteration": 3.1922342777252197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095988, "balance_loss_mlp": 1.08212388, "diversity_loss_mlp": 0.0, "epoch": 0.30742593305117355, "flos": 466139722752.0, "grad_norm": 0.07145022695402857, "language_loss": 0.79985273, "learning_rate": 0.0008113968152094369, "loss": 0.81081259, "num_input_tokens_seen": 132234592, "router_z_loss_mlp": 0.13879395, "routerloss_mlp": 0.0, "step": 1598, "time_per_iteration": 2.500500440597534 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090097, "balance_loss_mlp": 1.07637632, "diversity_loss_mlp": 0.0, "epoch": 0.3076183147364371, "flos": 686591354880.0, "grad_norm": 0.07896733537507578, "language_loss": 0.82477671, "learning_rate": 0.0008111530083880438, "loss": 0.83567768, "num_input_tokens_seen": 132314720, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1599, "time_per_iteration": 2.9081485271453857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090814, "balance_loss_mlp": 1.07693791, "diversity_loss_mlp": 0.0, "epoch": 0.30781069642170067, "flos": 614018340864.0, "grad_norm": 0.10700735308097704, "language_loss": 0.86289096, "learning_rate": 0.0008109090807663399, "loss": 0.87379909, "num_input_tokens_seen": 132388768, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1600, "time_per_iteration": 2.7883458137512207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084832, "balance_loss_mlp": 1.07049167, "diversity_loss_mlp": 0.0, "epoch": 0.3080030781069642, "flos": 590318129664.0, "grad_norm": 0.058046583591585654, "language_loss": 0.8845669, "learning_rate": 0.0008106650324390257, "loss": 0.89541531, "num_input_tokens_seen": 132472544, "router_z_loss_mlp": 0.14331055, "routerloss_mlp": 0.0, "step": 1601, "time_per_iteration": 2.8250818252563477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01012306, "balance_loss_mlp": 1.78856134, "diversity_loss_mlp": 0.20302816, "epoch": 0.3081954597922278, "flos": 562620349440.0, "grad_norm": 0.03151963489439222, "language_loss": 0.81347358, "learning_rate": 0.0008104208635008493, "loss": 0.8235966, "num_input_tokens_seen": 132541968, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0165114, "step": 1602, "time_per_iteration": 2.6824991703033447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078191, "balance_loss_mlp": 1.06365991, "diversity_loss_mlp": 0.0, "epoch": 0.3083878414774913, "flos": 447830913024.0, "grad_norm": 0.06925842581040223, "language_loss": 0.81696957, "learning_rate": 0.0008101765740466058, "loss": 0.82775152, "num_input_tokens_seen": 132606976, "router_z_loss_mlp": 0.1451416, "routerloss_mlp": 0.0, "step": 1603, "time_per_iteration": 2.4828884601593018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083153, "balance_loss_mlp": 1.06891942, "diversity_loss_mlp": 0.0, "epoch": 0.3085802231627549, "flos": 493546037760.0, "grad_norm": 0.08194523431430376, "language_loss": 0.83996522, "learning_rate": 0.0008099321641711364, "loss": 0.85079676, "num_input_tokens_seen": 132677984, "router_z_loss_mlp": 0.14221191, "routerloss_mlp": 0.0, "step": 1604, "time_per_iteration": 2.628990650177002 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093302, "balance_loss_mlp": 1.07891393, "diversity_loss_mlp": 0.0, "epoch": 0.3087726048480185, "flos": 487687703040.0, "grad_norm": 0.066381842407901, "language_loss": 0.83568424, "learning_rate": 0.0008096876339693295, "loss": 0.84661728, "num_input_tokens_seen": 132749136, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1605, "time_per_iteration": 2.621486186981201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104017, "balance_loss_mlp": 1.0898906, "diversity_loss_mlp": 0.0, "epoch": 0.308964986533282, "flos": 730589248512.0, "grad_norm": 0.08065648415588843, "language_loss": 0.8146233, "learning_rate": 0.0008094429835361206, "loss": 0.82566357, "num_input_tokens_seen": 132823824, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1606, "time_per_iteration": 2.9436137676239014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101821, "balance_loss_mlp": 1.08727765, "diversity_loss_mlp": 0.0, "epoch": 0.3091573682185456, "flos": 605407592448.0, "grad_norm": 0.06722603246449312, "language_loss": 0.85730284, "learning_rate": 0.0008091982129664908, "loss": 0.86832106, "num_input_tokens_seen": 132895936, "router_z_loss_mlp": 0.14538574, "routerloss_mlp": 0.0, "step": 1607, "time_per_iteration": 2.6776270866394043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110298, "balance_loss_mlp": 1.09606481, "diversity_loss_mlp": 0.0, "epoch": 0.30934974990380915, "flos": 460325804544.0, "grad_norm": 0.07435522574008574, "language_loss": 0.83177197, "learning_rate": 0.0008089533223554687, "loss": 0.842875, "num_input_tokens_seen": 132968960, "router_z_loss_mlp": 0.14257812, "routerloss_mlp": 0.0, "step": 1608, "time_per_iteration": 2.6971724033355713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106883, "balance_loss_mlp": 1.09322155, "diversity_loss_mlp": 0.0, "epoch": 0.30954213158907273, "flos": 553426297344.0, "grad_norm": 0.08534881839400792, "language_loss": 0.85436511, "learning_rate": 0.0008087083117981294, "loss": 0.86543399, "num_input_tokens_seen": 133048448, "router_z_loss_mlp": 0.13684082, "routerloss_mlp": 0.0, "step": 1609, "time_per_iteration": 2.873072624206543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100887, "balance_loss_mlp": 1.08715367, "diversity_loss_mlp": 0.0, "epoch": 0.30973451327433627, "flos": 553043427840.0, "grad_norm": 0.08408730625442483, "language_loss": 0.88209295, "learning_rate": 0.0008084631813895943, "loss": 0.89310181, "num_input_tokens_seen": 133121680, "router_z_loss_mlp": 0.13745117, "routerloss_mlp": 0.0, "step": 1610, "time_per_iteration": 2.7717368602752686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098096, "balance_loss_mlp": 1.0843389, "diversity_loss_mlp": 0.0, "epoch": 0.30992689495959985, "flos": 565696535040.0, "grad_norm": 0.07291880748627809, "language_loss": 0.84093356, "learning_rate": 0.0008082179312250315, "loss": 0.85191453, "num_input_tokens_seen": 133190176, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1611, "time_per_iteration": 2.6323728561401367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167376, "balance_loss_mlp": 1.15912676, "diversity_loss_mlp": 0.0, "epoch": 0.3101192766448634, "flos": 1442406776832.0, "grad_norm": 0.06715325583723679, "language_loss": 0.79855847, "learning_rate": 0.0008079725613996555, "loss": 0.81023216, "num_input_tokens_seen": 133420512, "router_z_loss_mlp": 0.08251953, "routerloss_mlp": 0.0, "step": 1612, "time_per_iteration": 4.837978839874268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103787, "balance_loss_mlp": 1.09591889, "diversity_loss_mlp": 0.0, "epoch": 0.31031165833012697, "flos": 1531892570112.0, "grad_norm": 0.04843806861709949, "language_loss": 0.76629329, "learning_rate": 0.0008077270720087273, "loss": 0.77733123, "num_input_tokens_seen": 133651984, "router_z_loss_mlp": 0.07861328, "routerloss_mlp": 0.0, "step": 1613, "time_per_iteration": 5.086154937744141 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118625, "balance_loss_mlp": 1.10497594, "diversity_loss_mlp": 0.0, "epoch": 0.31050404001539056, "flos": 991952676864.0, "grad_norm": 0.09649046421891638, "language_loss": 0.82414234, "learning_rate": 0.0008074814631475545, "loss": 0.83532858, "num_input_tokens_seen": 133741648, "router_z_loss_mlp": 0.13671875, "routerloss_mlp": 0.0, "step": 1614, "time_per_iteration": 3.3300058841705322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115901, "balance_loss_mlp": 1.10232294, "diversity_loss_mlp": 0.0, "epoch": 0.3106964217006541, "flos": 445992542208.0, "grad_norm": 0.10381126956618623, "language_loss": 0.7917223, "learning_rate": 0.0008072357349114907, "loss": 0.80288124, "num_input_tokens_seen": 133813344, "router_z_loss_mlp": 0.1361084, "routerloss_mlp": 0.0, "step": 1615, "time_per_iteration": 2.692242383956909 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123449, "balance_loss_mlp": 1.1100384, "diversity_loss_mlp": 0.0, "epoch": 0.3108888033859177, "flos": 510505804800.0, "grad_norm": 0.09811598085954727, "language_loss": 0.88751173, "learning_rate": 0.0008069898873959363, "loss": 0.89874619, "num_input_tokens_seen": 133884192, "router_z_loss_mlp": 0.13439941, "routerloss_mlp": 0.0, "step": 1616, "time_per_iteration": 2.688138723373413 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119599, "balance_loss_mlp": 1.10590243, "diversity_loss_mlp": 0.0, "epoch": 0.3110811850711812, "flos": 520732468224.0, "grad_norm": 0.06496922585492992, "language_loss": 0.85670269, "learning_rate": 0.0008067439206963375, "loss": 0.8678987, "num_input_tokens_seen": 133954848, "router_z_loss_mlp": 0.13684082, "routerloss_mlp": 0.0, "step": 1617, "time_per_iteration": 2.628465175628662 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126727, "balance_loss_mlp": 1.11359048, "diversity_loss_mlp": 0.0, "epoch": 0.3112735667564448, "flos": 686413315584.0, "grad_norm": 0.08367367493581554, "language_loss": 0.86233091, "learning_rate": 0.0008064978349081873, "loss": 0.87359822, "num_input_tokens_seen": 134031824, "router_z_loss_mlp": 0.13146973, "routerloss_mlp": 0.0, "step": 1618, "time_per_iteration": 2.9359195232391357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122791, "balance_loss_mlp": 1.10941529, "diversity_loss_mlp": 0.0, "epoch": 0.31146594844170833, "flos": 533061803520.0, "grad_norm": 0.062058920213391884, "language_loss": 0.86742592, "learning_rate": 0.0008062516301270245, "loss": 0.87865382, "num_input_tokens_seen": 134104480, "router_z_loss_mlp": 0.1338501, "routerloss_mlp": 0.0, "step": 1619, "time_per_iteration": 2.685615301132202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00968061, "balance_loss_mlp": 1.70987701, "diversity_loss_mlp": 0.19448289, "epoch": 0.3116583301269719, "flos": 679517227008.0, "grad_norm": 0.02692656797073588, "language_loss": 0.8831743, "learning_rate": 0.0008060053064484343, "loss": 0.89285493, "num_input_tokens_seen": 134185632, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01588114, "step": 1620, "time_per_iteration": 2.9507076740264893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131577, "balance_loss_mlp": 1.11839283, "diversity_loss_mlp": 0.0, "epoch": 0.31185071181223545, "flos": 586149861888.0, "grad_norm": 0.08216719715750098, "language_loss": 0.85142976, "learning_rate": 0.0008057588639680482, "loss": 0.86274558, "num_input_tokens_seen": 134261600, "router_z_loss_mlp": 0.13208008, "routerloss_mlp": 0.0, "step": 1621, "time_per_iteration": 2.7498936653137207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00955916, "balance_loss_mlp": 1.68915153, "diversity_loss_mlp": 0.19115068, "epoch": 0.31204309349749904, "flos": 725403050496.0, "grad_norm": 0.038673577194741904, "language_loss": 0.82934028, "learning_rate": 0.0008055123027815434, "loss": 0.83889943, "num_input_tokens_seen": 134334368, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01576493, "step": 1622, "time_per_iteration": 2.92877459526062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119016, "balance_loss_mlp": 1.10545552, "diversity_loss_mlp": 0.0, "epoch": 0.3122354751827626, "flos": 576825131520.0, "grad_norm": 0.11144773799130939, "language_loss": 0.8492527, "learning_rate": 0.0008052656229846436, "loss": 0.86044282, "num_input_tokens_seen": 134403824, "router_z_loss_mlp": 0.13580322, "routerloss_mlp": 0.0, "step": 1623, "time_per_iteration": 2.6647849082946777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104342, "balance_loss_mlp": 1.09039474, "diversity_loss_mlp": 0.0, "epoch": 0.31242785686802615, "flos": 575943022080.0, "grad_norm": 0.09067734621983937, "language_loss": 0.90320027, "learning_rate": 0.0008050188246731182, "loss": 0.9142437, "num_input_tokens_seen": 134471296, "router_z_loss_mlp": 0.13964844, "routerloss_mlp": 0.0, "step": 1624, "time_per_iteration": 2.6908931732177734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108727, "balance_loss_mlp": 1.07360816, "diversity_loss_mlp": 0.0, "epoch": 0.31262023855328974, "flos": 736830452736.0, "grad_norm": 0.08706559573327896, "language_loss": 0.8222695, "learning_rate": 0.0008047719079427834, "loss": 0.83314216, "num_input_tokens_seen": 134551360, "router_z_loss_mlp": 0.13684082, "routerloss_mlp": 0.0, "step": 1625, "time_per_iteration": 2.979578733444214 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01281481, "balance_loss_mlp": 1.27170551, "diversity_loss_mlp": 0.0, "epoch": 0.3128126202385533, "flos": 1559232073728.0, "grad_norm": 0.09241126848133228, "language_loss": 0.74351704, "learning_rate": 0.0008045248728895, "loss": 0.75633186, "num_input_tokens_seen": 134761328, "router_z_loss_mlp": 0.09765625, "routerloss_mlp": 0.0, "step": 1626, "time_per_iteration": 4.813723802566528 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078597, "balance_loss_mlp": 1.06489933, "diversity_loss_mlp": 0.0, "epoch": 0.31300500192381686, "flos": 514921121280.0, "grad_norm": 0.061158387019755324, "language_loss": 0.86164916, "learning_rate": 0.0008042777196091757, "loss": 0.87243509, "num_input_tokens_seen": 134833136, "router_z_loss_mlp": 0.13708496, "routerloss_mlp": 0.0, "step": 1627, "time_per_iteration": 2.6777052879333496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00931263, "balance_loss_mlp": 1.63595629, "diversity_loss_mlp": 0.19502082, "epoch": 0.3131973836090804, "flos": 526627878912.0, "grad_norm": 0.02888255305303151, "language_loss": 0.81839561, "learning_rate": 0.0008040304481977643, "loss": 0.82770824, "num_input_tokens_seen": 134904352, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01577434, "step": 1628, "time_per_iteration": 2.685519218444824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083762, "balance_loss_mlp": 1.07024312, "diversity_loss_mlp": 0.0, "epoch": 0.313389765294344, "flos": 822820114944.0, "grad_norm": 0.070875243316129, "language_loss": 0.86462033, "learning_rate": 0.0008037830587512649, "loss": 0.875458, "num_input_tokens_seen": 134984880, "router_z_loss_mlp": 0.13537598, "routerloss_mlp": 0.0, "step": 1629, "time_per_iteration": 3.0812296867370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093655, "balance_loss_mlp": 1.07976675, "diversity_loss_mlp": 0.0, "epoch": 0.31358214697960757, "flos": 393823669248.0, "grad_norm": 0.07857424850498267, "language_loss": 0.78910959, "learning_rate": 0.0008035355513657224, "loss": 0.80004621, "num_input_tokens_seen": 135047456, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1630, "time_per_iteration": 2.509866714477539 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109932, "balance_loss_mlp": 1.08518136, "diversity_loss_mlp": 0.0, "epoch": 0.3137745286648711, "flos": 571908003840.0, "grad_norm": 0.05926482463995905, "language_loss": 0.9323386, "learning_rate": 0.0008032879261372279, "loss": 0.94333184, "num_input_tokens_seen": 135124256, "router_z_loss_mlp": 0.14135742, "routerloss_mlp": 0.0, "step": 1631, "time_per_iteration": 2.793675422668457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0121244, "balance_loss_mlp": 1.20142555, "diversity_loss_mlp": 0.0, "epoch": 0.3139669103501347, "flos": 1498415376384.0, "grad_norm": 0.0543299042148954, "language_loss": 0.79635841, "learning_rate": 0.0008030401831619178, "loss": 0.80848283, "num_input_tokens_seen": 135353024, "router_z_loss_mlp": 0.11035156, "routerloss_mlp": 0.0, "step": 1632, "time_per_iteration": 5.6717705726623535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100999, "balance_loss_mlp": 1.08712876, "diversity_loss_mlp": 0.0, "epoch": 0.3141592920353982, "flos": 525343076352.0, "grad_norm": 0.07399367926820971, "language_loss": 0.87236691, "learning_rate": 0.0008027923225359748, "loss": 0.88337696, "num_input_tokens_seen": 135422464, "router_z_loss_mlp": 0.13885498, "routerloss_mlp": 0.0, "step": 1633, "time_per_iteration": 2.591161012649536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107353, "balance_loss_mlp": 1.09272563, "diversity_loss_mlp": 0.0, "epoch": 0.3143516737206618, "flos": 593268406272.0, "grad_norm": 0.07361205381971474, "language_loss": 0.8823992, "learning_rate": 0.0008025443443556267, "loss": 0.89347273, "num_input_tokens_seen": 135490928, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1634, "time_per_iteration": 2.714925765991211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106628, "balance_loss_mlp": 1.09279966, "diversity_loss_mlp": 0.0, "epoch": 0.31454405540592534, "flos": 648362589696.0, "grad_norm": 0.05821338652647348, "language_loss": 0.88174599, "learning_rate": 0.000802296248717147, "loss": 0.89281231, "num_input_tokens_seen": 135576288, "router_z_loss_mlp": 0.1385498, "routerloss_mlp": 0.0, "step": 1635, "time_per_iteration": 2.924661159515381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102803, "balance_loss_mlp": 1.08889091, "diversity_loss_mlp": 0.0, "epoch": 0.3147364370911889, "flos": 642847850496.0, "grad_norm": 0.06918051977022115, "language_loss": 0.78766519, "learning_rate": 0.0008020480357168554, "loss": 0.79869324, "num_input_tokens_seen": 135652320, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1636, "time_per_iteration": 2.8397598266601562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096954, "balance_loss_mlp": 1.08334041, "diversity_loss_mlp": 0.0, "epoch": 0.31492881877645246, "flos": 471849753600.0, "grad_norm": 0.061070409346790804, "language_loss": 0.88343245, "learning_rate": 0.0008017997054511165, "loss": 0.89440191, "num_input_tokens_seen": 135719632, "router_z_loss_mlp": 0.13623047, "routerloss_mlp": 0.0, "step": 1637, "time_per_iteration": 2.5770463943481445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109567, "balance_loss_mlp": 1.08241367, "diversity_loss_mlp": 0.0, "epoch": 0.31512120046171604, "flos": 629433773568.0, "grad_norm": 0.06082888573267997, "language_loss": 0.85688329, "learning_rate": 0.0008015512580163407, "loss": 0.86783999, "num_input_tokens_seen": 135796544, "router_z_loss_mlp": 0.1328125, "routerloss_mlp": 0.0, "step": 1638, "time_per_iteration": 2.7893900871276855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00915347, "balance_loss_mlp": 1.6005652, "diversity_loss_mlp": 0.19760543, "epoch": 0.31531358214697963, "flos": 703778347008.0, "grad_norm": 0.03200753828687725, "language_loss": 0.80247211, "learning_rate": 0.0008013026935089838, "loss": 0.8116256, "num_input_tokens_seen": 135871344, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0162621, "step": 1639, "time_per_iteration": 2.9013028144836426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116887, "balance_loss_mlp": 1.10366678, "diversity_loss_mlp": 0.0, "epoch": 0.31550596383224316, "flos": 572545635840.0, "grad_norm": 0.07107229367788748, "language_loss": 0.84156835, "learning_rate": 0.0008010540120255472, "loss": 0.85273731, "num_input_tokens_seen": 135944320, "router_z_loss_mlp": 0.13232422, "routerloss_mlp": 0.0, "step": 1640, "time_per_iteration": 2.6617894172668457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122905, "balance_loss_mlp": 1.10991144, "diversity_loss_mlp": 0.0, "epoch": 0.31569834551750675, "flos": 658340006400.0, "grad_norm": 0.08316081918757003, "language_loss": 0.86058956, "learning_rate": 0.0008008052136625774, "loss": 0.87181866, "num_input_tokens_seen": 136019456, "router_z_loss_mlp": 0.13006592, "routerloss_mlp": 0.0, "step": 1641, "time_per_iteration": 2.8128581047058105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117766, "balance_loss_mlp": 1.10461712, "diversity_loss_mlp": 0.0, "epoch": 0.3158907272027703, "flos": 566282036736.0, "grad_norm": 0.11340060957388516, "language_loss": 0.86898887, "learning_rate": 0.0008005562985166666, "loss": 0.88016647, "num_input_tokens_seen": 136091232, "router_z_loss_mlp": 0.13165283, "routerloss_mlp": 0.0, "step": 1642, "time_per_iteration": 2.6915791034698486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113412, "balance_loss_mlp": 1.10045385, "diversity_loss_mlp": 0.0, "epoch": 0.31608310888803387, "flos": 536891618304.0, "grad_norm": 0.06371803301806024, "language_loss": 0.85065734, "learning_rate": 0.0008003072666844524, "loss": 0.86179143, "num_input_tokens_seen": 136165088, "router_z_loss_mlp": 0.12976074, "routerloss_mlp": 0.0, "step": 1643, "time_per_iteration": 2.713515520095825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110554, "balance_loss_mlp": 1.09287417, "diversity_loss_mlp": 0.0, "epoch": 0.3162754905732974, "flos": 486669772800.0, "grad_norm": 0.09207812275617455, "language_loss": 0.82446098, "learning_rate": 0.0008000581182626173, "loss": 0.83551639, "num_input_tokens_seen": 136230368, "router_z_loss_mlp": 0.12670898, "routerloss_mlp": 0.0, "step": 1644, "time_per_iteration": 2.5728507041931152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099382, "balance_loss_mlp": 1.08668065, "diversity_loss_mlp": 0.0, "epoch": 0.316467872258561, "flos": 530052429312.0, "grad_norm": 0.07446065392993936, "language_loss": 0.86341298, "learning_rate": 0.0007998088533478894, "loss": 0.87440687, "num_input_tokens_seen": 136302512, "router_z_loss_mlp": 0.12713623, "routerloss_mlp": 0.0, "step": 1645, "time_per_iteration": 2.7022316455841064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103676, "balance_loss_mlp": 1.09096265, "diversity_loss_mlp": 0.0, "epoch": 0.3166602539438245, "flos": 443440189440.0, "grad_norm": 0.09512310951915111, "language_loss": 0.84171218, "learning_rate": 0.000799559472037042, "loss": 0.85274899, "num_input_tokens_seen": 136368064, "router_z_loss_mlp": 0.12719727, "routerloss_mlp": 0.0, "step": 1646, "time_per_iteration": 2.5341672897338867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089286, "balance_loss_mlp": 1.07678151, "diversity_loss_mlp": 0.0, "epoch": 0.3168526356290881, "flos": 645830060544.0, "grad_norm": 0.05690135295492242, "language_loss": 0.87462902, "learning_rate": 0.0007993099744268932, "loss": 0.88552189, "num_input_tokens_seen": 136451520, "router_z_loss_mlp": 0.12506104, "routerloss_mlp": 0.0, "step": 1647, "time_per_iteration": 2.9204719066619873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097973, "balance_loss_mlp": 1.08491409, "diversity_loss_mlp": 0.0, "epoch": 0.3170450173143517, "flos": 586162344960.0, "grad_norm": 0.08028992569563033, "language_loss": 0.88103539, "learning_rate": 0.000799060360614307, "loss": 0.8920151, "num_input_tokens_seen": 136521184, "router_z_loss_mlp": 0.13079834, "routerloss_mlp": 0.0, "step": 1648, "time_per_iteration": 2.7098584175109863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094994, "balance_loss_mlp": 1.08204746, "diversity_loss_mlp": 0.0, "epoch": 0.3172373989996152, "flos": 827124203520.0, "grad_norm": 0.07374581447427947, "language_loss": 0.83565277, "learning_rate": 0.0007988106306961917, "loss": 0.84660268, "num_input_tokens_seen": 136612592, "router_z_loss_mlp": 0.12963867, "routerloss_mlp": 0.0, "step": 1649, "time_per_iteration": 3.136148691177368 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096074, "balance_loss_mlp": 1.08292556, "diversity_loss_mlp": 0.0, "epoch": 0.3174297806848788, "flos": 527408672256.0, "grad_norm": 0.08307651310008923, "language_loss": 0.84510154, "learning_rate": 0.0007985607847695014, "loss": 0.85606229, "num_input_tokens_seen": 136684336, "router_z_loss_mlp": 0.13171387, "routerloss_mlp": 0.0, "step": 1650, "time_per_iteration": 2.6657865047454834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090136, "balance_loss_mlp": 1.07697558, "diversity_loss_mlp": 0.0, "epoch": 0.31762216237014235, "flos": 713179800576.0, "grad_norm": 0.07221907468491222, "language_loss": 0.82981718, "learning_rate": 0.0007983108229312345, "loss": 0.84071863, "num_input_tokens_seen": 136766400, "router_z_loss_mlp": 0.13183594, "routerloss_mlp": 0.0, "step": 1651, "time_per_iteration": 2.939943313598633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109006, "balance_loss_mlp": 1.07648206, "diversity_loss_mlp": 0.0, "epoch": 0.31781454405540593, "flos": 483813471744.0, "grad_norm": 0.0785368607999539, "language_loss": 0.86505926, "learning_rate": 0.0007980607452784351, "loss": 0.87595987, "num_input_tokens_seen": 136834016, "router_z_loss_mlp": 0.13598633, "routerloss_mlp": 0.0, "step": 1652, "time_per_iteration": 2.586700916290283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082675, "balance_loss_mlp": 1.06952596, "diversity_loss_mlp": 0.0, "epoch": 0.31800692574066947, "flos": 548746679808.0, "grad_norm": 0.06920593361186494, "language_loss": 0.90510356, "learning_rate": 0.0007978105519081919, "loss": 0.91593033, "num_input_tokens_seen": 136906288, "router_z_loss_mlp": 0.13165283, "routerloss_mlp": 0.0, "step": 1653, "time_per_iteration": 2.665844440460205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084984, "balance_loss_mlp": 1.0715965, "diversity_loss_mlp": 0.0, "epoch": 0.31819930742593305, "flos": 516895312896.0, "grad_norm": 0.07269169213621761, "language_loss": 0.87967515, "learning_rate": 0.0007975602429176385, "loss": 0.89052504, "num_input_tokens_seen": 136972416, "router_z_loss_mlp": 0.13415527, "routerloss_mlp": 0.0, "step": 1654, "time_per_iteration": 2.5818393230438232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085975, "balance_loss_mlp": 1.07225442, "diversity_loss_mlp": 0.0, "epoch": 0.31839168911119664, "flos": 455991980544.0, "grad_norm": 0.08150423110047789, "language_loss": 0.81308222, "learning_rate": 0.0007973098184039536, "loss": 0.82394195, "num_input_tokens_seen": 137044576, "router_z_loss_mlp": 0.13757324, "routerloss_mlp": 0.0, "step": 1655, "time_per_iteration": 2.664916515350342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094509, "balance_loss_mlp": 1.08110952, "diversity_loss_mlp": 0.0, "epoch": 0.3185840707964602, "flos": 626033816064.0, "grad_norm": 0.0661968945841423, "language_loss": 0.8695243, "learning_rate": 0.0007970592784643602, "loss": 0.88046944, "num_input_tokens_seen": 137125120, "router_z_loss_mlp": 0.13427734, "routerloss_mlp": 0.0, "step": 1656, "time_per_iteration": 2.851214647293091 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104427, "balance_loss_mlp": 1.09084868, "diversity_loss_mlp": 0.0, "epoch": 0.31877645248172376, "flos": 567478006272.0, "grad_norm": 0.0809768283097012, "language_loss": 0.85228848, "learning_rate": 0.0007968086231961272, "loss": 0.86333275, "num_input_tokens_seen": 137195344, "router_z_loss_mlp": 0.13598633, "routerloss_mlp": 0.0, "step": 1657, "time_per_iteration": 2.6277201175689697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111168, "balance_loss_mlp": 1.09744644, "diversity_loss_mlp": 0.0, "epoch": 0.3189688341669873, "flos": 489580402176.0, "grad_norm": 0.10999441213252201, "language_loss": 0.83322126, "learning_rate": 0.0007965578526965671, "loss": 0.84433806, "num_input_tokens_seen": 137261040, "router_z_loss_mlp": 0.14245605, "routerloss_mlp": 0.0, "step": 1658, "time_per_iteration": 2.5514447689056396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097556, "balance_loss_mlp": 1.08337009, "diversity_loss_mlp": 0.0, "epoch": 0.3191612158522509, "flos": 576234487296.0, "grad_norm": 0.07090711515760839, "language_loss": 0.86299932, "learning_rate": 0.0007963069670630377, "loss": 0.87397492, "num_input_tokens_seen": 137334400, "router_z_loss_mlp": 0.1418457, "routerloss_mlp": 0.0, "step": 1659, "time_per_iteration": 2.722572088241577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108941, "balance_loss_mlp": 1.07523549, "diversity_loss_mlp": 0.0, "epoch": 0.3193535975375144, "flos": 538132004352.0, "grad_norm": 0.07181055202596492, "language_loss": 0.88127738, "learning_rate": 0.0007960559663929416, "loss": 0.8921715, "num_input_tokens_seen": 137405344, "router_z_loss_mlp": 0.1418457, "routerloss_mlp": 0.0, "step": 1660, "time_per_iteration": 2.6411688327789307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079317, "balance_loss_mlp": 1.06500006, "diversity_loss_mlp": 0.0, "epoch": 0.319545979222778, "flos": 734288011776.0, "grad_norm": 0.06614466369263741, "language_loss": 0.87915826, "learning_rate": 0.0007958048507837259, "loss": 0.88995141, "num_input_tokens_seen": 137486016, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1661, "time_per_iteration": 2.954888343811035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075627, "balance_loss_mlp": 1.06107187, "diversity_loss_mlp": 0.0, "epoch": 0.31973836090804153, "flos": 764461794816.0, "grad_norm": 0.08599761261652404, "language_loss": 0.87309289, "learning_rate": 0.0007955536203328822, "loss": 0.88384914, "num_input_tokens_seen": 137562304, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1662, "time_per_iteration": 2.9499282836914062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074811, "balance_loss_mlp": 1.06073272, "diversity_loss_mlp": 0.0, "epoch": 0.3199307425933051, "flos": 560549611008.0, "grad_norm": 0.08962386225204486, "language_loss": 0.8334958, "learning_rate": 0.0007953022751379469, "loss": 0.84424388, "num_input_tokens_seen": 137639248, "router_z_loss_mlp": 0.140625, "routerloss_mlp": 0.0, "step": 1663, "time_per_iteration": 2.768754005432129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075722, "balance_loss_mlp": 1.06131005, "diversity_loss_mlp": 0.0, "epoch": 0.3201231242785687, "flos": 751349094912.0, "grad_norm": 0.08182948291647181, "language_loss": 0.8200748, "learning_rate": 0.000795050815296501, "loss": 0.830832, "num_input_tokens_seen": 137718256, "router_z_loss_mlp": 0.14416504, "routerloss_mlp": 0.0, "step": 1664, "time_per_iteration": 2.9893014430999756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084976, "balance_loss_mlp": 1.07167196, "diversity_loss_mlp": 0.0, "epoch": 0.32031550596383224, "flos": 496402338816.0, "grad_norm": 0.0641722272838546, "language_loss": 0.93037909, "learning_rate": 0.0007947992409061695, "loss": 0.94122881, "num_input_tokens_seen": 137785216, "router_z_loss_mlp": 0.13330078, "routerloss_mlp": 0.0, "step": 1665, "time_per_iteration": 2.583789110183716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100063, "balance_loss_mlp": 1.08662808, "diversity_loss_mlp": 0.0, "epoch": 0.3205078876490958, "flos": 731609750016.0, "grad_norm": 0.07388769827525307, "language_loss": 0.86501724, "learning_rate": 0.0007945475520646226, "loss": 0.87601787, "num_input_tokens_seen": 137863424, "router_z_loss_mlp": 0.13464355, "routerloss_mlp": 0.0, "step": 1666, "time_per_iteration": 2.944988965988159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127031, "balance_loss_mlp": 1.11408508, "diversity_loss_mlp": 0.0, "epoch": 0.32070026933435936, "flos": 549436068864.0, "grad_norm": 0.0781321549049884, "language_loss": 0.84777099, "learning_rate": 0.0007942957488695743, "loss": 0.85904133, "num_input_tokens_seen": 137930384, "router_z_loss_mlp": 0.12957764, "routerloss_mlp": 0.0, "step": 1667, "time_per_iteration": 2.667464017868042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138299, "balance_loss_mlp": 1.12505507, "diversity_loss_mlp": 0.0, "epoch": 0.32089265101962294, "flos": 745295468544.0, "grad_norm": 0.06588913292879497, "language_loss": 0.81000018, "learning_rate": 0.0007940438314187833, "loss": 0.82138324, "num_input_tokens_seen": 138017200, "router_z_loss_mlp": 0.13250732, "routerloss_mlp": 0.0, "step": 1668, "time_per_iteration": 3.0395359992980957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147791, "balance_loss_mlp": 1.13491094, "diversity_loss_mlp": 0.0, "epoch": 0.3210850327048865, "flos": 494188439040.0, "grad_norm": 0.07621602089938284, "language_loss": 0.80540276, "learning_rate": 0.0007937917998100529, "loss": 0.8168807, "num_input_tokens_seen": 138084048, "router_z_loss_mlp": 0.12896729, "routerloss_mlp": 0.0, "step": 1669, "time_per_iteration": 2.5894687175750732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142528, "balance_loss_mlp": 1.1294744, "diversity_loss_mlp": 0.0, "epoch": 0.32127741439015006, "flos": 530640502272.0, "grad_norm": 0.07981389159152626, "language_loss": 0.79167509, "learning_rate": 0.0007935396541412302, "loss": 0.80310035, "num_input_tokens_seen": 138153280, "router_z_loss_mlp": 0.13067627, "routerloss_mlp": 0.0, "step": 1670, "time_per_iteration": 2.672978401184082 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141245, "balance_loss_mlp": 1.12813175, "diversity_loss_mlp": 0.0, "epoch": 0.3214697960754136, "flos": 501203096064.0, "grad_norm": 0.06899314705075654, "language_loss": 0.85712755, "learning_rate": 0.0007932873945102068, "loss": 0.86854005, "num_input_tokens_seen": 138222320, "router_z_loss_mlp": 0.13128662, "routerloss_mlp": 0.0, "step": 1671, "time_per_iteration": 2.6296515464782715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01272088, "balance_loss_mlp": 1.25616145, "diversity_loss_mlp": 0.0, "epoch": 0.3216621777606772, "flos": 1383341815296.0, "grad_norm": 0.05047573422440889, "language_loss": 0.75761777, "learning_rate": 0.0007930350210149188, "loss": 0.77033865, "num_input_tokens_seen": 138449488, "router_z_loss_mlp": 0.15917969, "routerloss_mlp": 0.0, "step": 1672, "time_per_iteration": 4.840561628341675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138556, "balance_loss_mlp": 1.1251744, "diversity_loss_mlp": 0.0, "epoch": 0.32185455944594077, "flos": 571535046144.0, "grad_norm": 0.06902528499394482, "language_loss": 0.86527705, "learning_rate": 0.0007927825337533461, "loss": 0.87666261, "num_input_tokens_seen": 138522496, "router_z_loss_mlp": 0.1338501, "routerloss_mlp": 0.0, "step": 1673, "time_per_iteration": 2.693758964538574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01142697, "balance_loss_mlp": 1.12930942, "diversity_loss_mlp": 0.0, "epoch": 0.3220469411312043, "flos": 543908846592.0, "grad_norm": 0.08521571565711833, "language_loss": 0.84877092, "learning_rate": 0.0007925299328235131, "loss": 0.8601979, "num_input_tokens_seen": 138590096, "router_z_loss_mlp": 0.1340332, "routerloss_mlp": 0.0, "step": 1674, "time_per_iteration": 2.659621238708496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141943, "balance_loss_mlp": 1.12855613, "diversity_loss_mlp": 0.0, "epoch": 0.3222393228164679, "flos": 491139417600.0, "grad_norm": 0.08187135533898351, "language_loss": 0.84720862, "learning_rate": 0.000792277218323488, "loss": 0.85862803, "num_input_tokens_seen": 138658224, "router_z_loss_mlp": 0.1340332, "routerloss_mlp": 0.0, "step": 1675, "time_per_iteration": 2.646108865737915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135022, "balance_loss_mlp": 1.12169456, "diversity_loss_mlp": 0.0, "epoch": 0.3224317045017314, "flos": 490388359680.0, "grad_norm": 0.08499328402904442, "language_loss": 0.8509531, "learning_rate": 0.0007920243903513833, "loss": 0.86230332, "num_input_tokens_seen": 138722864, "router_z_loss_mlp": 0.13342285, "routerloss_mlp": 0.0, "step": 1676, "time_per_iteration": 2.5730555057525635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126699, "balance_loss_mlp": 1.11364567, "diversity_loss_mlp": 0.0, "epoch": 0.322624086186995, "flos": 575777465856.0, "grad_norm": 0.08854342537284099, "language_loss": 0.84008271, "learning_rate": 0.0007917714490053556, "loss": 0.85134971, "num_input_tokens_seen": 138791472, "router_z_loss_mlp": 0.1307373, "routerloss_mlp": 0.0, "step": 1677, "time_per_iteration": 2.718555212020874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122958, "balance_loss_mlp": 1.10974979, "diversity_loss_mlp": 0.0, "epoch": 0.32281646787225854, "flos": 629292810240.0, "grad_norm": 0.07711595043056121, "language_loss": 0.86223996, "learning_rate": 0.0007915183943836055, "loss": 0.87346947, "num_input_tokens_seen": 138873424, "router_z_loss_mlp": 0.13220215, "routerloss_mlp": 0.0, "step": 1678, "time_per_iteration": 2.902038812637329 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112402, "balance_loss_mlp": 1.09958673, "diversity_loss_mlp": 0.0, "epoch": 0.3230088495575221, "flos": 781389255168.0, "grad_norm": 0.07762427611918464, "language_loss": 0.8422336, "learning_rate": 0.0007912652265843773, "loss": 0.85335761, "num_input_tokens_seen": 138956880, "router_z_loss_mlp": 0.1282959, "routerloss_mlp": 0.0, "step": 1679, "time_per_iteration": 3.024665117263794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107958, "balance_loss_mlp": 1.09453535, "diversity_loss_mlp": 0.0, "epoch": 0.3232012312427857, "flos": 536110824960.0, "grad_norm": 0.06959311244041297, "language_loss": 0.81845474, "learning_rate": 0.0007910119457059597, "loss": 0.82953429, "num_input_tokens_seen": 139031296, "router_z_loss_mlp": 0.13439941, "routerloss_mlp": 0.0, "step": 1680, "time_per_iteration": 2.6954221725463867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111109, "balance_loss_mlp": 1.09806776, "diversity_loss_mlp": 0.0, "epoch": 0.32339361292804925, "flos": 704857946112.0, "grad_norm": 0.08135634404485692, "language_loss": 0.80380678, "learning_rate": 0.0007907585518466849, "loss": 0.81491786, "num_input_tokens_seen": 139109776, "router_z_loss_mlp": 0.13061523, "routerloss_mlp": 0.0, "step": 1681, "time_per_iteration": 2.961648464202881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108764, "balance_loss_mlp": 1.09574652, "diversity_loss_mlp": 0.0, "epoch": 0.32358599461331283, "flos": 452330293248.0, "grad_norm": 0.06462126830885603, "language_loss": 0.89670283, "learning_rate": 0.000790505045104929, "loss": 0.90779042, "num_input_tokens_seen": 139174736, "router_z_loss_mlp": 0.13031006, "routerloss_mlp": 0.0, "step": 1682, "time_per_iteration": 2.5210485458374023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111018, "balance_loss_mlp": 1.09719789, "diversity_loss_mlp": 0.0, "epoch": 0.32377837629857636, "flos": 600895729152.0, "grad_norm": 0.08715930327910015, "language_loss": 0.86719161, "learning_rate": 0.0007902514255791125, "loss": 0.8782934, "num_input_tokens_seen": 139252064, "router_z_loss_mlp": 0.13000488, "routerloss_mlp": 0.0, "step": 1683, "time_per_iteration": 2.8002610206604004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097901, "balance_loss_mlp": 1.084764, "diversity_loss_mlp": 0.0, "epoch": 0.32397075798383995, "flos": 807523250688.0, "grad_norm": 0.06656486310868524, "language_loss": 0.8795855, "learning_rate": 0.0007899976933676986, "loss": 0.89056444, "num_input_tokens_seen": 139333328, "router_z_loss_mlp": 0.13140869, "routerloss_mlp": 0.0, "step": 1684, "time_per_iteration": 2.967172622680664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092249, "balance_loss_mlp": 1.07880259, "diversity_loss_mlp": 0.0, "epoch": 0.3241631396691035, "flos": 601689005568.0, "grad_norm": 0.09628316614228749, "language_loss": 0.87045735, "learning_rate": 0.0007897438485691955, "loss": 0.88137984, "num_input_tokens_seen": 139400976, "router_z_loss_mlp": 0.13464355, "routerloss_mlp": 0.0, "step": 1685, "time_per_iteration": 2.680147171020508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103099, "balance_loss_mlp": 1.0898304, "diversity_loss_mlp": 0.0, "epoch": 0.32435552135436707, "flos": 474219297792.0, "grad_norm": 0.0850736326825917, "language_loss": 0.82684374, "learning_rate": 0.0007894898912821542, "loss": 0.83787471, "num_input_tokens_seen": 139465664, "router_z_loss_mlp": 0.13293457, "routerloss_mlp": 0.0, "step": 1686, "time_per_iteration": 2.554380416870117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101582, "balance_loss_mlp": 1.0880518, "diversity_loss_mlp": 0.0, "epoch": 0.3245479030396306, "flos": 538102268928.0, "grad_norm": 0.06056792299191916, "language_loss": 0.86695451, "learning_rate": 0.0007892358216051695, "loss": 0.87797034, "num_input_tokens_seen": 139541984, "router_z_loss_mlp": 0.13537598, "routerloss_mlp": 0.0, "step": 1687, "time_per_iteration": 2.7851648330688477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109888, "balance_loss_mlp": 1.09641767, "diversity_loss_mlp": 0.0, "epoch": 0.3247402847248942, "flos": 547654597632.0, "grad_norm": 0.07434076211008771, "language_loss": 0.91829026, "learning_rate": 0.0007889816396368803, "loss": 0.92938912, "num_input_tokens_seen": 139607408, "router_z_loss_mlp": 0.13476562, "routerloss_mlp": 0.0, "step": 1688, "time_per_iteration": 2.6211581230163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111829, "balance_loss_mlp": 1.10499799, "diversity_loss_mlp": 0.0, "epoch": 0.3249326664101578, "flos": 378151276032.0, "grad_norm": 0.07845440141588131, "language_loss": 0.85253429, "learning_rate": 0.0007887273454759687, "loss": 0.8637172, "num_input_tokens_seen": 139670000, "router_z_loss_mlp": 0.13299561, "routerloss_mlp": 0.0, "step": 1689, "time_per_iteration": 2.507779598236084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122219, "balance_loss_mlp": 1.10946417, "diversity_loss_mlp": 0.0, "epoch": 0.3251250480954213, "flos": 528078237696.0, "grad_norm": 0.08373410695529686, "language_loss": 0.82792354, "learning_rate": 0.0007884729392211603, "loss": 0.83914578, "num_input_tokens_seen": 139739872, "router_z_loss_mlp": 0.12768555, "routerloss_mlp": 0.0, "step": 1690, "time_per_iteration": 2.6805906295776367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119249, "balance_loss_mlp": 1.10672641, "diversity_loss_mlp": 0.0, "epoch": 0.3253174297806849, "flos": 449659372032.0, "grad_norm": 0.09069843341009556, "language_loss": 0.85648167, "learning_rate": 0.0007882184209712245, "loss": 0.86767411, "num_input_tokens_seen": 139802032, "router_z_loss_mlp": 0.12530518, "routerloss_mlp": 0.0, "step": 1691, "time_per_iteration": 2.569239377975464 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00949982, "balance_loss_mlp": 1.66309059, "diversity_loss_mlp": 0.20491584, "epoch": 0.32550981146594843, "flos": 704181040128.0, "grad_norm": 0.028395749586794427, "language_loss": 0.85757548, "learning_rate": 0.000787963790824974, "loss": 0.86707526, "num_input_tokens_seen": 139885648, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01597837, "step": 1692, "time_per_iteration": 3.009209156036377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113225, "balance_loss_mlp": 1.10071397, "diversity_loss_mlp": 0.0, "epoch": 0.325702193151212, "flos": 392704422912.0, "grad_norm": 0.22846677162281695, "language_loss": 0.89612615, "learning_rate": 0.0007877090488812651, "loss": 0.90725839, "num_input_tokens_seen": 139947920, "router_z_loss_mlp": 0.12512207, "routerloss_mlp": 0.0, "step": 1693, "time_per_iteration": 2.450209617614746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00936753, "balance_loss_mlp": 1.63723278, "diversity_loss_mlp": 0.20419246, "epoch": 0.32589457483647555, "flos": 577494696960.0, "grad_norm": 0.03161007726798549, "language_loss": 0.83743423, "learning_rate": 0.0007874541952389973, "loss": 0.84680176, "num_input_tokens_seen": 140020048, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01604037, "step": 1694, "time_per_iteration": 2.6965737342834473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111615, "balance_loss_mlp": 1.10350823, "diversity_loss_mlp": 0.0, "epoch": 0.32608695652173914, "flos": 498339454464.0, "grad_norm": 0.07424213060006848, "language_loss": 0.86538494, "learning_rate": 0.0007871992299971136, "loss": 0.87654638, "num_input_tokens_seen": 140085600, "router_z_loss_mlp": 0.12652588, "routerloss_mlp": 0.0, "step": 1695, "time_per_iteration": 2.570406913757324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131966, "balance_loss_mlp": 1.11953878, "diversity_loss_mlp": 0.0, "epoch": 0.32627933820700267, "flos": 591145910784.0, "grad_norm": 0.0612219868328418, "language_loss": 0.84142137, "learning_rate": 0.0007869441532546001, "loss": 0.852741, "num_input_tokens_seen": 140155152, "router_z_loss_mlp": 0.12432861, "routerloss_mlp": 0.0, "step": 1696, "time_per_iteration": 2.763688087463379 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128533, "balance_loss_mlp": 1.11626601, "diversity_loss_mlp": 0.0, "epoch": 0.32647171989226625, "flos": 609086532096.0, "grad_norm": 0.06155756648422996, "language_loss": 0.79298395, "learning_rate": 0.0007866889651104867, "loss": 0.80426925, "num_input_tokens_seen": 140228560, "router_z_loss_mlp": 0.12255859, "routerloss_mlp": 0.0, "step": 1697, "time_per_iteration": 2.816236972808838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130866, "balance_loss_mlp": 1.11769366, "diversity_loss_mlp": 0.0, "epoch": 0.32666410157752984, "flos": 477154520064.0, "grad_norm": 0.0827611554210385, "language_loss": 0.83172429, "learning_rate": 0.000786433665663846, "loss": 0.84303296, "num_input_tokens_seen": 140297952, "router_z_loss_mlp": 0.13195801, "routerloss_mlp": 0.0, "step": 1698, "time_per_iteration": 2.6627049446105957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135283, "balance_loss_mlp": 1.12240815, "diversity_loss_mlp": 0.0, "epoch": 0.3268564832627934, "flos": 718385822208.0, "grad_norm": 0.08562611300573084, "language_loss": 0.86256903, "learning_rate": 0.0007861782550137942, "loss": 0.87392187, "num_input_tokens_seen": 140373408, "router_z_loss_mlp": 0.12884521, "routerloss_mlp": 0.0, "step": 1699, "time_per_iteration": 2.9298973083496094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115677, "balance_loss_mlp": 1.10270739, "diversity_loss_mlp": 0.0, "epoch": 0.32704886494805696, "flos": 768795618816.0, "grad_norm": 0.06870341741306431, "language_loss": 0.85913056, "learning_rate": 0.0007859227332594901, "loss": 0.8702873, "num_input_tokens_seen": 140451840, "router_z_loss_mlp": 0.12988281, "routerloss_mlp": 0.0, "step": 1700, "time_per_iteration": 2.9108214378356934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099921, "balance_loss_mlp": 1.08703494, "diversity_loss_mlp": 0.0, "epoch": 0.3272412466333205, "flos": 849912569856.0, "grad_norm": 0.08010897822069696, "language_loss": 0.84705722, "learning_rate": 0.0007856671005001365, "loss": 0.85805643, "num_input_tokens_seen": 140537696, "router_z_loss_mlp": 0.12884521, "routerloss_mlp": 0.0, "step": 1701, "time_per_iteration": 3.172921895980835 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088126, "balance_loss_mlp": 1.07506084, "diversity_loss_mlp": 0.0, "epoch": 0.3274336283185841, "flos": 831586507776.0, "grad_norm": 0.0963591610521261, "language_loss": 0.81720912, "learning_rate": 0.0007854113568349787, "loss": 0.82809043, "num_input_tokens_seen": 140623536, "router_z_loss_mlp": 0.13085938, "routerloss_mlp": 0.0, "step": 1702, "time_per_iteration": 3.1135685443878174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100269, "balance_loss_mlp": 1.08686948, "diversity_loss_mlp": 0.0, "epoch": 0.3276260100038476, "flos": 692027172864.0, "grad_norm": 0.07838750037803571, "language_loss": 0.80661154, "learning_rate": 0.0007851555023633052, "loss": 0.8176142, "num_input_tokens_seen": 140700688, "router_z_loss_mlp": 0.13397217, "routerloss_mlp": 0.0, "step": 1703, "time_per_iteration": 2.841059684753418 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086421, "balance_loss_mlp": 1.07271171, "diversity_loss_mlp": 0.0, "epoch": 0.3278183916891112, "flos": 436059915264.0, "grad_norm": 0.07047077484334266, "language_loss": 0.82222247, "learning_rate": 0.0007848995371844474, "loss": 0.83308667, "num_input_tokens_seen": 140765808, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1704, "time_per_iteration": 2.515455961227417 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094981, "balance_loss_mlp": 1.0816896, "diversity_loss_mlp": 0.0, "epoch": 0.3280107733743748, "flos": 461109169152.0, "grad_norm": 0.08203255389116743, "language_loss": 0.80260348, "learning_rate": 0.0007846434613977801, "loss": 0.81355333, "num_input_tokens_seen": 140830512, "router_z_loss_mlp": 0.13305664, "routerloss_mlp": 0.0, "step": 1705, "time_per_iteration": 2.523026466369629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100095, "balance_loss_mlp": 1.0868392, "diversity_loss_mlp": 0.0, "epoch": 0.3282031550596383, "flos": 679319737344.0, "grad_norm": 0.07270926258732689, "language_loss": 0.78603041, "learning_rate": 0.0007843872751027203, "loss": 0.7970314, "num_input_tokens_seen": 140902816, "router_z_loss_mlp": 0.13275146, "routerloss_mlp": 0.0, "step": 1706, "time_per_iteration": 2.8923709392547607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00915397, "balance_loss_mlp": 1.59612775, "diversity_loss_mlp": 0.20258766, "epoch": 0.3283955367449019, "flos": 545107014144.0, "grad_norm": 0.02966318853366187, "language_loss": 0.87305748, "learning_rate": 0.0007841309783987287, "loss": 0.88221151, "num_input_tokens_seen": 140975488, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01603885, "step": 1707, "time_per_iteration": 2.7517144680023193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115655, "balance_loss_mlp": 1.10263109, "diversity_loss_mlp": 0.0, "epoch": 0.32858791843016544, "flos": 481261118976.0, "grad_norm": 0.06500174516261728, "language_loss": 0.89240694, "learning_rate": 0.0007838745713853084, "loss": 0.9035635, "num_input_tokens_seen": 141043248, "router_z_loss_mlp": 0.13031006, "routerloss_mlp": 0.0, "step": 1708, "time_per_iteration": 2.6181201934814453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122322, "balance_loss_mlp": 1.10945296, "diversity_loss_mlp": 0.0, "epoch": 0.328780300115429, "flos": 566805869568.0, "grad_norm": 0.06936064314807153, "language_loss": 0.8434307, "learning_rate": 0.0007836180541620053, "loss": 0.85465395, "num_input_tokens_seen": 141119408, "router_z_loss_mlp": 0.12866211, "routerloss_mlp": 0.0, "step": 1709, "time_per_iteration": 2.7040350437164307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124287, "balance_loss_mlp": 1.1112572, "diversity_loss_mlp": 0.0, "epoch": 0.32897268180069256, "flos": 476027933184.0, "grad_norm": 0.06883588356672955, "language_loss": 0.86454904, "learning_rate": 0.0007833614268284082, "loss": 0.87579191, "num_input_tokens_seen": 141184112, "router_z_loss_mlp": 0.13043213, "routerloss_mlp": 0.0, "step": 1710, "time_per_iteration": 2.5110740661621094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01425821, "balance_loss_mlp": 1.41738081, "diversity_loss_mlp": 0.0, "epoch": 0.32916506348595614, "flos": 1577301548544.0, "grad_norm": 0.1402114647579648, "language_loss": 0.74109769, "learning_rate": 0.0007831046894841489, "loss": 0.75535595, "num_input_tokens_seen": 141414960, "router_z_loss_mlp": 0.08447266, "routerloss_mlp": 0.0, "step": 1711, "time_per_iteration": 4.873327016830444 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129416, "balance_loss_mlp": 1.11650598, "diversity_loss_mlp": 0.0, "epoch": 0.3293574451712197, "flos": 482886945792.0, "grad_norm": 0.0798208466882041, "language_loss": 0.78414649, "learning_rate": 0.0007828478422289016, "loss": 0.79544067, "num_input_tokens_seen": 141485744, "router_z_loss_mlp": 0.12927246, "routerloss_mlp": 0.0, "step": 1712, "time_per_iteration": 2.608412027359009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138403, "balance_loss_mlp": 1.12507582, "diversity_loss_mlp": 0.0, "epoch": 0.32954982685648326, "flos": 622557508608.0, "grad_norm": 0.07544776571140048, "language_loss": 0.8909815, "learning_rate": 0.0007825908851623833, "loss": 0.90236557, "num_input_tokens_seen": 141560592, "router_z_loss_mlp": 0.13323975, "routerloss_mlp": 0.0, "step": 1713, "time_per_iteration": 2.8033607006073 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134961, "balance_loss_mlp": 1.12190771, "diversity_loss_mlp": 0.0, "epoch": 0.32974220854174685, "flos": 544971193344.0, "grad_norm": 0.06974595077498419, "language_loss": 0.85003847, "learning_rate": 0.0007823338183843533, "loss": 0.86138809, "num_input_tokens_seen": 141630400, "router_z_loss_mlp": 0.1307373, "routerloss_mlp": 0.0, "step": 1714, "time_per_iteration": 2.6861188411712646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148942, "balance_loss_mlp": 1.13610959, "diversity_loss_mlp": 0.0, "epoch": 0.3299345902270104, "flos": 982155870720.0, "grad_norm": 0.07049806127627434, "language_loss": 0.81025606, "learning_rate": 0.0007820766419946141, "loss": 0.82174551, "num_input_tokens_seen": 141721552, "router_z_loss_mlp": 0.1282959, "routerloss_mlp": 0.0, "step": 1715, "time_per_iteration": 3.3007164001464844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01168148, "balance_loss_mlp": 1.16008925, "diversity_loss_mlp": 0.0, "epoch": 0.33012697191227397, "flos": 1403664090624.0, "grad_norm": 0.052131774928428895, "language_loss": 0.7967248, "learning_rate": 0.0007818193560930102, "loss": 0.80840629, "num_input_tokens_seen": 141956464, "router_z_loss_mlp": 0.08056641, "routerloss_mlp": 0.0, "step": 1716, "time_per_iteration": 4.947760105133057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00906852, "balance_loss_mlp": 1.58163857, "diversity_loss_mlp": 0.20079982, "epoch": 0.3303193535975375, "flos": 505151479296.0, "grad_norm": 0.033697214377685164, "language_loss": 0.75853068, "learning_rate": 0.0007815619607794288, "loss": 0.76759923, "num_input_tokens_seen": 142029552, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01563331, "step": 1717, "time_per_iteration": 2.689937114715576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173062, "balance_loss_mlp": 1.1601274, "diversity_loss_mlp": 0.0, "epoch": 0.3305117352828011, "flos": 937977739776.0, "grad_norm": 0.09689448967864323, "language_loss": 0.8294118, "learning_rate": 0.0007813044561538001, "loss": 0.84114236, "num_input_tokens_seen": 142117344, "router_z_loss_mlp": 0.12945557, "routerloss_mlp": 0.0, "step": 1718, "time_per_iteration": 3.1421005725860596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158875, "balance_loss_mlp": 1.14559531, "diversity_loss_mlp": 0.0, "epoch": 0.3307041169680646, "flos": 721499083776.0, "grad_norm": 0.06842928932014077, "language_loss": 0.88578129, "learning_rate": 0.0007810468423160958, "loss": 0.89736998, "num_input_tokens_seen": 142190096, "router_z_loss_mlp": 0.13293457, "routerloss_mlp": 0.0, "step": 1719, "time_per_iteration": 2.8917293548583984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01157511, "balance_loss_mlp": 1.14486265, "diversity_loss_mlp": 0.0, "epoch": 0.3308964986533282, "flos": 583614761472.0, "grad_norm": 0.06941390463820386, "language_loss": 0.81896281, "learning_rate": 0.0007807891193663306, "loss": 0.83053792, "num_input_tokens_seen": 142265584, "router_z_loss_mlp": 0.12640381, "routerloss_mlp": 0.0, "step": 1720, "time_per_iteration": 2.8352882862091064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141979, "balance_loss_mlp": 1.12950385, "diversity_loss_mlp": 0.0, "epoch": 0.33108888033859174, "flos": 473576896512.0, "grad_norm": 0.07961809028947962, "language_loss": 0.82409328, "learning_rate": 0.0007805312874045614, "loss": 0.83551311, "num_input_tokens_seen": 142330352, "router_z_loss_mlp": 0.12481689, "routerloss_mlp": 0.0, "step": 1721, "time_per_iteration": 2.5056259632110596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137994, "balance_loss_mlp": 1.12510777, "diversity_loss_mlp": 0.0, "epoch": 0.3312812620238553, "flos": 386129534976.0, "grad_norm": 0.09061115976682882, "language_loss": 0.86960506, "learning_rate": 0.0007802733465308874, "loss": 0.88098502, "num_input_tokens_seen": 142392208, "router_z_loss_mlp": 0.12896729, "routerloss_mlp": 0.0, "step": 1722, "time_per_iteration": 2.438533306121826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144898, "balance_loss_mlp": 1.13225603, "diversity_loss_mlp": 0.0, "epoch": 0.3314736437091189, "flos": 494554056192.0, "grad_norm": 0.06773749819611302, "language_loss": 0.84162688, "learning_rate": 0.0007800152968454501, "loss": 0.8530758, "num_input_tokens_seen": 142462112, "router_z_loss_mlp": 0.12652588, "routerloss_mlp": 0.0, "step": 1723, "time_per_iteration": 2.6364991664886475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134044, "balance_loss_mlp": 1.12146711, "diversity_loss_mlp": 0.0, "epoch": 0.33166602539438245, "flos": 653662586880.0, "grad_norm": 0.06044198445597461, "language_loss": 0.90330362, "learning_rate": 0.0007797571384484334, "loss": 0.91464406, "num_input_tokens_seen": 142539120, "router_z_loss_mlp": 0.12567139, "routerloss_mlp": 0.0, "step": 1724, "time_per_iteration": 2.8638265132904053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133346, "balance_loss_mlp": 1.12061453, "diversity_loss_mlp": 0.0, "epoch": 0.33185840707964603, "flos": 520806620160.0, "grad_norm": 0.0752969909322094, "language_loss": 0.91929704, "learning_rate": 0.0007794988714400633, "loss": 0.93063056, "num_input_tokens_seen": 142611520, "router_z_loss_mlp": 0.12744141, "routerloss_mlp": 0.0, "step": 1725, "time_per_iteration": 2.615788698196411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125798, "balance_loss_mlp": 1.11242867, "diversity_loss_mlp": 0.0, "epoch": 0.33205078876490957, "flos": 436949365248.0, "grad_norm": 0.07890733478173245, "language_loss": 0.85302055, "learning_rate": 0.0007792404959206079, "loss": 0.86427855, "num_input_tokens_seen": 142676064, "router_z_loss_mlp": 0.13372803, "routerloss_mlp": 0.0, "step": 1726, "time_per_iteration": 2.545780897140503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107165, "balance_loss_mlp": 1.09446895, "diversity_loss_mlp": 0.0, "epoch": 0.33224317045017315, "flos": 768738719232.0, "grad_norm": 0.07756389475354548, "language_loss": 0.81480336, "learning_rate": 0.0007789820119903774, "loss": 0.82587504, "num_input_tokens_seen": 142750944, "router_z_loss_mlp": 0.12689209, "routerloss_mlp": 0.0, "step": 1727, "time_per_iteration": 3.005662441253662 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114992, "balance_loss_mlp": 1.10335684, "diversity_loss_mlp": 0.0, "epoch": 0.3324355521354367, "flos": 1466381574144.0, "grad_norm": 0.03748312413261812, "language_loss": 0.78492665, "learning_rate": 0.0007787234197497242, "loss": 0.7960766, "num_input_tokens_seen": 142974032, "router_z_loss_mlp": 0.11621094, "routerloss_mlp": 0.0, "step": 1728, "time_per_iteration": 4.833205223083496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105872, "balance_loss_mlp": 1.09285486, "diversity_loss_mlp": 0.0, "epoch": 0.3326279338207003, "flos": 496691232768.0, "grad_norm": 0.07170574552345628, "language_loss": 0.83970881, "learning_rate": 0.0007784647192990428, "loss": 0.85076749, "num_input_tokens_seen": 143047280, "router_z_loss_mlp": 0.13012695, "routerloss_mlp": 0.0, "step": 1729, "time_per_iteration": 2.7309772968292236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107979, "balance_loss_mlp": 1.0948776, "diversity_loss_mlp": 0.0, "epoch": 0.33282031550596386, "flos": 635890093056.0, "grad_norm": 0.06011930461286596, "language_loss": 0.80777055, "learning_rate": 0.0007782059107387696, "loss": 0.81885028, "num_input_tokens_seen": 143124224, "router_z_loss_mlp": 0.13116455, "routerloss_mlp": 0.0, "step": 1730, "time_per_iteration": 2.8615641593933105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113195, "balance_loss_mlp": 1.11733532, "diversity_loss_mlp": 0.0, "epoch": 0.3330126971912274, "flos": 689511896064.0, "grad_norm": 0.08106060743083753, "language_loss": 0.88617826, "learning_rate": 0.0007779469941693826, "loss": 0.89749771, "num_input_tokens_seen": 143194048, "router_z_loss_mlp": 0.1463623, "routerloss_mlp": 0.0, "step": 1731, "time_per_iteration": 2.801208257675171 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126142, "balance_loss_mlp": 1.11240935, "diversity_loss_mlp": 0.0, "epoch": 0.333205078876491, "flos": 566457504768.0, "grad_norm": 0.09519717038034853, "language_loss": 0.77091044, "learning_rate": 0.0007776879696914029, "loss": 0.78217185, "num_input_tokens_seen": 143272976, "router_z_loss_mlp": 0.13757324, "routerloss_mlp": 0.0, "step": 1732, "time_per_iteration": 2.8286595344543457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123068, "balance_loss_mlp": 1.10889435, "diversity_loss_mlp": 0.0, "epoch": 0.3333974605617545, "flos": 640927987200.0, "grad_norm": 0.05947539267688924, "language_loss": 0.88910627, "learning_rate": 0.000777428837405392, "loss": 0.90033698, "num_input_tokens_seen": 143346496, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1733, "time_per_iteration": 2.8319156169891357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121491, "balance_loss_mlp": 1.10701954, "diversity_loss_mlp": 0.0, "epoch": 0.3335898422470181, "flos": 461833062912.0, "grad_norm": 0.07113995025739508, "language_loss": 0.86735553, "learning_rate": 0.0007771695974119544, "loss": 0.87857044, "num_input_tokens_seen": 143410448, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1734, "time_per_iteration": 2.5376570224761963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112031, "balance_loss_mlp": 1.09795249, "diversity_loss_mlp": 0.0, "epoch": 0.33378222393228163, "flos": 852870187008.0, "grad_norm": 0.08734149249458338, "language_loss": 0.75937277, "learning_rate": 0.0007769102498117359, "loss": 0.77049315, "num_input_tokens_seen": 143492416, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1735, "time_per_iteration": 3.093188524246216 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105698, "balance_loss_mlp": 1.09138131, "diversity_loss_mlp": 0.0, "epoch": 0.3339746056175452, "flos": 954665491968.0, "grad_norm": 0.06929562674350419, "language_loss": 0.79383999, "learning_rate": 0.000776650794705424, "loss": 0.80489695, "num_input_tokens_seen": 143590096, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1736, "time_per_iteration": 3.253673791885376 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121685, "balance_loss_mlp": 1.10730791, "diversity_loss_mlp": 0.0, "epoch": 0.33416698730280875, "flos": 544825460736.0, "grad_norm": 0.06325878214231093, "language_loss": 0.82130396, "learning_rate": 0.0007763912321937483, "loss": 0.83252084, "num_input_tokens_seen": 143663344, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1737, "time_per_iteration": 2.7109947204589844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117751, "balance_loss_mlp": 1.10324299, "diversity_loss_mlp": 0.0, "epoch": 0.33435936898807234, "flos": 1014096070656.0, "grad_norm": 0.08404595709863052, "language_loss": 0.82403475, "learning_rate": 0.0007761315623774799, "loss": 0.83521223, "num_input_tokens_seen": 143753072, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1738, "time_per_iteration": 3.4125657081604004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109047, "balance_loss_mlp": 1.0946703, "diversity_loss_mlp": 0.0, "epoch": 0.3345517506733359, "flos": 615221650944.0, "grad_norm": 0.08421865543081901, "language_loss": 0.87820536, "learning_rate": 0.0007758717853574313, "loss": 0.88929582, "num_input_tokens_seen": 143827280, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1739, "time_per_iteration": 2.7345223426818848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106099, "balance_loss_mlp": 1.09184134, "diversity_loss_mlp": 0.0, "epoch": 0.33474413235859946, "flos": 494593703424.0, "grad_norm": 0.07638673743764693, "language_loss": 0.90095574, "learning_rate": 0.0007756119012344571, "loss": 0.91201669, "num_input_tokens_seen": 143895072, "router_z_loss_mlp": 0.14257812, "routerloss_mlp": 0.0, "step": 1740, "time_per_iteration": 2.5901129245758057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101021, "balance_loss_mlp": 1.08709717, "diversity_loss_mlp": 0.0, "epoch": 0.33493651404386304, "flos": 628381338624.0, "grad_norm": 0.06863708242027233, "language_loss": 0.8461023, "learning_rate": 0.0007753519101094535, "loss": 0.85711253, "num_input_tokens_seen": 143965728, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1741, "time_per_iteration": 2.770315647125244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089984, "balance_loss_mlp": 1.07595301, "diversity_loss_mlp": 0.0, "epoch": 0.3351288957291266, "flos": 513727723008.0, "grad_norm": 0.07992644583812669, "language_loss": 0.86363387, "learning_rate": 0.0007750918120833575, "loss": 0.87453371, "num_input_tokens_seen": 144030272, "router_z_loss_mlp": 0.14050293, "routerloss_mlp": 0.0, "step": 1742, "time_per_iteration": 2.58940052986145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088465, "balance_loss_mlp": 1.07488728, "diversity_loss_mlp": 0.0, "epoch": 0.33532127741439016, "flos": 647302814208.0, "grad_norm": 0.11201991585260462, "language_loss": 0.87392128, "learning_rate": 0.0007748316072571485, "loss": 0.88480592, "num_input_tokens_seen": 144104048, "router_z_loss_mlp": 0.13586426, "routerloss_mlp": 0.0, "step": 1743, "time_per_iteration": 2.8557286262512207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086266, "balance_loss_mlp": 1.07202053, "diversity_loss_mlp": 0.0, "epoch": 0.3355136590996537, "flos": 768464506368.0, "grad_norm": 0.0749416267225997, "language_loss": 0.79045737, "learning_rate": 0.0007745712957318467, "loss": 0.80131996, "num_input_tokens_seen": 144180432, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1744, "time_per_iteration": 2.9912548065185547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084233, "balance_loss_mlp": 1.07057166, "diversity_loss_mlp": 0.0, "epoch": 0.3357060407849173, "flos": 595536634368.0, "grad_norm": 0.06946859722884112, "language_loss": 0.86471289, "learning_rate": 0.0007743108776085141, "loss": 0.87555522, "num_input_tokens_seen": 144258704, "router_z_loss_mlp": 0.13684082, "routerloss_mlp": 0.0, "step": 1745, "time_per_iteration": 2.7899224758148193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084466, "balance_loss_mlp": 1.07023191, "diversity_loss_mlp": 0.0, "epoch": 0.3358984224701808, "flos": 598590425088.0, "grad_norm": 0.08256839233284315, "language_loss": 0.82965624, "learning_rate": 0.0007740503529882543, "loss": 0.84050083, "num_input_tokens_seen": 144335104, "router_z_loss_mlp": 0.14233398, "routerloss_mlp": 0.0, "step": 1746, "time_per_iteration": 2.808084011077881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084564, "balance_loss_mlp": 1.07044971, "diversity_loss_mlp": 0.0, "epoch": 0.3360908041554444, "flos": 578329818624.0, "grad_norm": 0.07349682427851349, "language_loss": 0.90707254, "learning_rate": 0.0007737897219722114, "loss": 0.91791821, "num_input_tokens_seen": 144402912, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1747, "time_per_iteration": 2.712833881378174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092286, "balance_loss_mlp": 1.07794499, "diversity_loss_mlp": 0.0, "epoch": 0.336283185840708, "flos": 513589330944.0, "grad_norm": 0.05794758251669461, "language_loss": 0.81094921, "learning_rate": 0.0007735289846615716, "loss": 0.82187206, "num_input_tokens_seen": 144475328, "router_z_loss_mlp": 0.14343262, "routerloss_mlp": 0.0, "step": 1748, "time_per_iteration": 2.677976369857788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108166, "balance_loss_mlp": 1.09457588, "diversity_loss_mlp": 0.0, "epoch": 0.3364755675259715, "flos": 524974887936.0, "grad_norm": 0.0827866783592608, "language_loss": 0.823035, "learning_rate": 0.0007732681411575621, "loss": 0.8341167, "num_input_tokens_seen": 144548288, "router_z_loss_mlp": 0.13586426, "routerloss_mlp": 0.0, "step": 1749, "time_per_iteration": 2.674349069595337 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114062, "balance_loss_mlp": 1.09997165, "diversity_loss_mlp": 0.0, "epoch": 0.3366679492112351, "flos": 554869315584.0, "grad_norm": 0.4203922337067485, "language_loss": 0.87328398, "learning_rate": 0.0007730071915614514, "loss": 0.88442457, "num_input_tokens_seen": 144619488, "router_z_loss_mlp": 0.14086914, "routerloss_mlp": 0.0, "step": 1750, "time_per_iteration": 2.6714634895324707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113648, "balance_loss_mlp": 1.10037947, "diversity_loss_mlp": 0.0, "epoch": 0.33686033089649864, "flos": 427273698816.0, "grad_norm": 0.09571011442330926, "language_loss": 0.88792437, "learning_rate": 0.0007727461359745489, "loss": 0.89906085, "num_input_tokens_seen": 144682560, "router_z_loss_mlp": 0.13293457, "routerloss_mlp": 0.0, "step": 1751, "time_per_iteration": 2.469905376434326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01141755, "balance_loss_mlp": 1.12897623, "diversity_loss_mlp": 0.0, "epoch": 0.3370527125817622, "flos": 541729451520.0, "grad_norm": 0.07412184794878955, "language_loss": 0.85941112, "learning_rate": 0.0007724849744982056, "loss": 0.87082875, "num_input_tokens_seen": 144753328, "router_z_loss_mlp": 0.12792969, "routerloss_mlp": 0.0, "step": 1752, "time_per_iteration": 2.6805977821350098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0117715, "balance_loss_mlp": 1.16388226, "diversity_loss_mlp": 0.0, "epoch": 0.33724509426702576, "flos": 542114892288.0, "grad_norm": 0.09378397224837084, "language_loss": 0.81843758, "learning_rate": 0.0007722237072338131, "loss": 0.83020908, "num_input_tokens_seen": 144827312, "router_z_loss_mlp": 0.1328125, "routerloss_mlp": 0.0, "step": 1753, "time_per_iteration": 2.7348344326019287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01186311, "balance_loss_mlp": 1.17280459, "diversity_loss_mlp": 0.0, "epoch": 0.33743747595228935, "flos": 472796103168.0, "grad_norm": 0.1034159122014491, "language_loss": 0.85304463, "learning_rate": 0.0007719623342828046, "loss": 0.86490774, "num_input_tokens_seen": 144893488, "router_z_loss_mlp": 0.13537598, "routerloss_mlp": 0.0, "step": 1754, "time_per_iteration": 2.5181336402893066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01202577, "balance_loss_mlp": 1.18872511, "diversity_loss_mlp": 0.0, "epoch": 0.33762985763755293, "flos": 469818662400.0, "grad_norm": 0.12703041648808322, "language_loss": 0.84088987, "learning_rate": 0.000771700855746654, "loss": 0.85291564, "num_input_tokens_seen": 144961152, "router_z_loss_mlp": 0.13867188, "routerloss_mlp": 0.0, "step": 1755, "time_per_iteration": 2.590925931930542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01188345, "balance_loss_mlp": 1.1743381, "diversity_loss_mlp": 0.0, "epoch": 0.33782223932281646, "flos": 492251323392.0, "grad_norm": 0.06849832931784437, "language_loss": 0.88371092, "learning_rate": 0.0007714392717268763, "loss": 0.89559436, "num_input_tokens_seen": 145030576, "router_z_loss_mlp": 0.14013672, "routerloss_mlp": 0.0, "step": 1756, "time_per_iteration": 2.560246706008911 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01189305, "balance_loss_mlp": 1.17545295, "diversity_loss_mlp": 0.0, "epoch": 0.33801462100808005, "flos": 465064892928.0, "grad_norm": 0.09135673410225151, "language_loss": 0.8630141, "learning_rate": 0.0007711775823250273, "loss": 0.8749072, "num_input_tokens_seen": 145095648, "router_z_loss_mlp": 0.13867188, "routerloss_mlp": 0.0, "step": 1757, "time_per_iteration": 2.562939167022705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01194838, "balance_loss_mlp": 1.18069935, "diversity_loss_mlp": 0.0, "epoch": 0.3382070026933436, "flos": 795668189184.0, "grad_norm": 0.07414503329772545, "language_loss": 0.83081156, "learning_rate": 0.0007709157876427039, "loss": 0.84275991, "num_input_tokens_seen": 145181248, "router_z_loss_mlp": 0.14147949, "routerloss_mlp": 0.0, "step": 1758, "time_per_iteration": 3.0652947425842285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01190916, "balance_loss_mlp": 1.17681408, "diversity_loss_mlp": 0.0, "epoch": 0.33839938437860717, "flos": 508430297088.0, "grad_norm": 0.06977999371164574, "language_loss": 0.85321373, "learning_rate": 0.0007706538877815439, "loss": 0.86512285, "num_input_tokens_seen": 145252944, "router_z_loss_mlp": 0.14111328, "routerloss_mlp": 0.0, "step": 1759, "time_per_iteration": 2.5949320793151855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01202515, "balance_loss_mlp": 1.1888063, "diversity_loss_mlp": 0.0, "epoch": 0.3385917660638707, "flos": 484243329024.0, "grad_norm": 0.052908737395413206, "language_loss": 0.83029473, "learning_rate": 0.0007703918828432259, "loss": 0.84231991, "num_input_tokens_seen": 145323168, "router_z_loss_mlp": 0.13720703, "routerloss_mlp": 0.0, "step": 1760, "time_per_iteration": 2.6404576301574707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01231589, "balance_loss_mlp": 1.21696198, "diversity_loss_mlp": 0.0, "epoch": 0.3387841477491343, "flos": 545339381760.0, "grad_norm": 0.11529749255982873, "language_loss": 0.89274669, "learning_rate": 0.000770129772929469, "loss": 0.90506256, "num_input_tokens_seen": 145395776, "router_z_loss_mlp": 0.14611816, "routerloss_mlp": 0.0, "step": 1761, "time_per_iteration": 2.6486427783966064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01212596, "balance_loss_mlp": 1.19812357, "diversity_loss_mlp": 0.0, "epoch": 0.3389765294343978, "flos": 719801676288.0, "grad_norm": 0.10010821715075297, "language_loss": 0.8820551, "learning_rate": 0.0007698675581420334, "loss": 0.89418107, "num_input_tokens_seen": 145470576, "router_z_loss_mlp": 0.14453125, "routerloss_mlp": 0.0, "step": 1762, "time_per_iteration": 2.8473589420318604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01170537, "balance_loss_mlp": 1.15610099, "diversity_loss_mlp": 0.0, "epoch": 0.3391689111196614, "flos": 699928708608.0, "grad_norm": 0.06768336788468338, "language_loss": 0.79040444, "learning_rate": 0.0007696052385827199, "loss": 0.80210984, "num_input_tokens_seen": 145548896, "router_z_loss_mlp": 0.14440918, "routerloss_mlp": 0.0, "step": 1763, "time_per_iteration": 2.9893951416015625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01147034, "balance_loss_mlp": 1.13271689, "diversity_loss_mlp": 0.0, "epoch": 0.339361292804925, "flos": 627093964800.0, "grad_norm": 0.06731413775333611, "language_loss": 0.78161937, "learning_rate": 0.00076934281435337, "loss": 0.79308975, "num_input_tokens_seen": 145617136, "router_z_loss_mlp": 0.14318848, "routerloss_mlp": 0.0, "step": 1764, "time_per_iteration": 2.7329161167144775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00933074, "balance_loss_mlp": 1.62411106, "diversity_loss_mlp": 0.20785357, "epoch": 0.33955367449018853, "flos": 609600453120.0, "grad_norm": 0.0341650984642099, "language_loss": 0.86205357, "learning_rate": 0.0007690802855558658, "loss": 0.87138426, "num_input_tokens_seen": 145696416, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0170921, "step": 1765, "time_per_iteration": 2.9281163215637207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121638, "balance_loss_mlp": 1.10924029, "diversity_loss_mlp": 0.0, "epoch": 0.3397460561754521, "flos": 1453310346240.0, "grad_norm": 0.029090002598214117, "language_loss": 0.76374954, "learning_rate": 0.0007688176522921302, "loss": 0.77496594, "num_input_tokens_seen": 145919680, "router_z_loss_mlp": 0.12353516, "routerloss_mlp": 0.0, "step": 1766, "time_per_iteration": 4.91774320602417 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104609, "balance_loss_mlp": 1.08886182, "diversity_loss_mlp": 0.0, "epoch": 0.33993843786071565, "flos": 487312174080.0, "grad_norm": 0.08396151855964885, "language_loss": 0.89357018, "learning_rate": 0.0007685549146641262, "loss": 0.90461624, "num_input_tokens_seen": 145984272, "router_z_loss_mlp": 0.15734863, "routerloss_mlp": 0.0, "step": 1767, "time_per_iteration": 2.5867435932159424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108303, "balance_loss_mlp": 1.093521, "diversity_loss_mlp": 0.0, "epoch": 0.34013081954597923, "flos": 417338500608.0, "grad_norm": 0.10736891621188589, "language_loss": 0.8816734, "learning_rate": 0.0007682920727738579, "loss": 0.89275646, "num_input_tokens_seen": 146047248, "router_z_loss_mlp": 0.14782715, "routerloss_mlp": 0.0, "step": 1768, "time_per_iteration": 2.5119268894195557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102653, "balance_loss_mlp": 1.08738232, "diversity_loss_mlp": 0.0, "epoch": 0.34032320123124277, "flos": 437520185856.0, "grad_norm": 0.10494960168224592, "language_loss": 0.85048056, "learning_rate": 0.000768029126723369, "loss": 0.86150718, "num_input_tokens_seen": 146111872, "router_z_loss_mlp": 0.15246582, "routerloss_mlp": 0.0, "step": 1769, "time_per_iteration": 2.495424270629883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090257, "balance_loss_mlp": 1.07520068, "diversity_loss_mlp": 0.0, "epoch": 0.34051558291650635, "flos": 457590643200.0, "grad_norm": 0.08686425564719477, "language_loss": 0.82128584, "learning_rate": 0.0007677660766147447, "loss": 0.83218843, "num_input_tokens_seen": 146172608, "router_z_loss_mlp": 0.15039062, "routerloss_mlp": 0.0, "step": 1770, "time_per_iteration": 2.532904624938965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066946, "balance_loss_mlp": 1.05578792, "diversity_loss_mlp": 0.0, "epoch": 0.3407079646017699, "flos": 1558849204224.0, "grad_norm": 0.023964921008177247, "language_loss": 0.72470945, "learning_rate": 0.0007675029225501102, "loss": 0.73537892, "num_input_tokens_seen": 146413584, "router_z_loss_mlp": 0.11181641, "routerloss_mlp": 0.0, "step": 1771, "time_per_iteration": 4.944117784500122 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117931, "balance_loss_mlp": 1.1034112, "diversity_loss_mlp": 0.0, "epoch": 0.3409003462870335, "flos": 492555271680.0, "grad_norm": 0.10616133846526872, "language_loss": 0.795196, "learning_rate": 0.0007672396646316306, "loss": 0.80637527, "num_input_tokens_seen": 146476992, "router_z_loss_mlp": 0.1451416, "routerloss_mlp": 0.0, "step": 1772, "time_per_iteration": 2.6089062690734863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134399, "balance_loss_mlp": 1.11959314, "diversity_loss_mlp": 0.0, "epoch": 0.34109272797229706, "flos": 808479512064.0, "grad_norm": 0.07513330183645242, "language_loss": 0.80376065, "learning_rate": 0.000766976302961512, "loss": 0.8151046, "num_input_tokens_seen": 146552848, "router_z_loss_mlp": 0.14782715, "routerloss_mlp": 0.0, "step": 1773, "time_per_iteration": 3.042421340942383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158934, "balance_loss_mlp": 1.14410484, "diversity_loss_mlp": 0.0, "epoch": 0.3412851096575606, "flos": 470142434304.0, "grad_norm": 0.07872996810077096, "language_loss": 0.81390858, "learning_rate": 0.0007667128376420003, "loss": 0.82549793, "num_input_tokens_seen": 146617504, "router_z_loss_mlp": 0.14807129, "routerloss_mlp": 0.0, "step": 1774, "time_per_iteration": 2.536562442779541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01208475, "balance_loss_mlp": 1.19358635, "diversity_loss_mlp": 0.0, "epoch": 0.3414774913428242, "flos": 595675026432.0, "grad_norm": 0.08297883362487203, "language_loss": 0.8462863, "learning_rate": 0.0007664492687753817, "loss": 0.85837102, "num_input_tokens_seen": 146691568, "router_z_loss_mlp": 0.14880371, "routerloss_mlp": 0.0, "step": 1775, "time_per_iteration": 2.6977102756500244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01198612, "balance_loss_mlp": 1.18424678, "diversity_loss_mlp": 0.0, "epoch": 0.3416698730280877, "flos": 527463000576.0, "grad_norm": 0.10155126624771216, "language_loss": 0.81542516, "learning_rate": 0.000766185596463983, "loss": 0.82741123, "num_input_tokens_seen": 146764208, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1776, "time_per_iteration": 2.6038215160369873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01196202, "balance_loss_mlp": 1.18163514, "diversity_loss_mlp": 0.0, "epoch": 0.3418622547133513, "flos": 874640623104.0, "grad_norm": 0.0897891274607312, "language_loss": 0.77011722, "learning_rate": 0.0007659218208101706, "loss": 0.78207922, "num_input_tokens_seen": 146847744, "router_z_loss_mlp": 0.14550781, "routerloss_mlp": 0.0, "step": 1777, "time_per_iteration": 3.0933022499084473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01173425, "balance_loss_mlp": 1.15902483, "diversity_loss_mlp": 0.0, "epoch": 0.34205463639861483, "flos": 603744689664.0, "grad_norm": 0.08364054831663822, "language_loss": 0.85122472, "learning_rate": 0.0007656579419163515, "loss": 0.86295897, "num_input_tokens_seen": 146918336, "router_z_loss_mlp": 0.1439209, "routerloss_mlp": 0.0, "step": 1778, "time_per_iteration": 2.732297420501709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146549, "balance_loss_mlp": 1.13211274, "diversity_loss_mlp": 0.0, "epoch": 0.3422470180838784, "flos": 463780090368.0, "grad_norm": 0.0722191895240348, "language_loss": 0.77409559, "learning_rate": 0.0007653939598849724, "loss": 0.78556108, "num_input_tokens_seen": 146982496, "router_z_loss_mlp": 0.14416504, "routerloss_mlp": 0.0, "step": 1779, "time_per_iteration": 2.4908664226531982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01032648, "balance_loss_mlp": 1.02253902, "diversity_loss_mlp": 0.0, "epoch": 0.34243939976914195, "flos": 1586428416000.0, "grad_norm": 0.029240552967656448, "language_loss": 0.82880205, "learning_rate": 0.0007651298748185204, "loss": 0.83912855, "num_input_tokens_seen": 147213600, "router_z_loss_mlp": 0.10107422, "routerloss_mlp": 0.0, "step": 1780, "time_per_iteration": 4.9182775020599365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121045, "balance_loss_mlp": 1.10688317, "diversity_loss_mlp": 0.0, "epoch": 0.34263178145440554, "flos": 873017367552.0, "grad_norm": 0.07624931845389674, "language_loss": 0.80176342, "learning_rate": 0.000764865686819522, "loss": 0.81297386, "num_input_tokens_seen": 147287664, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1781, "time_per_iteration": 3.0602052211761475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111853, "balance_loss_mlp": 1.097965, "diversity_loss_mlp": 0.0, "epoch": 0.3428241631396691, "flos": 506878622208.0, "grad_norm": 0.07936344533488468, "language_loss": 0.85836053, "learning_rate": 0.0007646013959905449, "loss": 0.86947906, "num_input_tokens_seen": 147356800, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1782, "time_per_iteration": 2.5750925540924072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109071, "balance_loss_mlp": 1.09528995, "diversity_loss_mlp": 0.0, "epoch": 0.34301654482493266, "flos": 880039365120.0, "grad_norm": 0.07233814650781724, "language_loss": 0.81042612, "learning_rate": 0.0007643370024341949, "loss": 0.82151681, "num_input_tokens_seen": 147432496, "router_z_loss_mlp": 0.13806152, "routerloss_mlp": 0.0, "step": 1783, "time_per_iteration": 3.0870087146759033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110812, "balance_loss_mlp": 1.09431553, "diversity_loss_mlp": 0.0, "epoch": 0.34320892651019624, "flos": 431763167232.0, "grad_norm": 0.07806584209391611, "language_loss": 0.83175099, "learning_rate": 0.0007640725062531195, "loss": 0.84283221, "num_input_tokens_seen": 147495856, "router_z_loss_mlp": 0.13818359, "routerloss_mlp": 0.0, "step": 1784, "time_per_iteration": 2.5063886642456055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102776, "balance_loss_mlp": 1.08888865, "diversity_loss_mlp": 0.0, "epoch": 0.3434013081954598, "flos": 463641698304.0, "grad_norm": 0.5067557182324087, "language_loss": 0.86699629, "learning_rate": 0.0007638079075500047, "loss": 0.87802398, "num_input_tokens_seen": 147559632, "router_z_loss_mlp": 0.13891602, "routerloss_mlp": 0.0, "step": 1785, "time_per_iteration": 2.532945394515991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01015111, "balance_loss_mlp": 1.00562215, "diversity_loss_mlp": 0.0, "epoch": 0.34359368988072336, "flos": 1557332034048.0, "grad_norm": 0.016449027395748255, "language_loss": 0.75180668, "learning_rate": 0.0007635432064275772, "loss": 0.76195776, "num_input_tokens_seen": 147794576, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 1786, "time_per_iteration": 4.944318056106567 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150049, "balance_loss_mlp": 1.13542247, "diversity_loss_mlp": 0.0, "epoch": 0.3437860715659869, "flos": 495527569920.0, "grad_norm": 0.07356798682381475, "language_loss": 0.83088338, "learning_rate": 0.0007632784029886026, "loss": 0.84238386, "num_input_tokens_seen": 147866960, "router_z_loss_mlp": 0.14599609, "routerloss_mlp": 0.0, "step": 1787, "time_per_iteration": 2.6217002868652344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01204344, "balance_loss_mlp": 1.1884768, "diversity_loss_mlp": 0.0, "epoch": 0.3439784532512505, "flos": 718274594304.0, "grad_norm": 0.08799574205003287, "language_loss": 0.85466659, "learning_rate": 0.0007630134973358873, "loss": 0.86671007, "num_input_tokens_seen": 147947808, "router_z_loss_mlp": 0.15856934, "routerloss_mlp": 0.0, "step": 1788, "time_per_iteration": 2.9664394855499268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01251833, "balance_loss_mlp": 1.2359066, "diversity_loss_mlp": 0.0, "epoch": 0.34417083493651407, "flos": 565862091264.0, "grad_norm": 0.1052875761358054, "language_loss": 0.86575854, "learning_rate": 0.0007627484895722763, "loss": 0.87827688, "num_input_tokens_seen": 148015936, "router_z_loss_mlp": 0.15917969, "routerloss_mlp": 0.0, "step": 1789, "time_per_iteration": 2.67280912399292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01247407, "balance_loss_mlp": 1.23117065, "diversity_loss_mlp": 0.0, "epoch": 0.3443632166217776, "flos": 796330414080.0, "grad_norm": 0.09611070791328494, "language_loss": 0.80025196, "learning_rate": 0.0007624833798006552, "loss": 0.81272602, "num_input_tokens_seen": 148099776, "router_z_loss_mlp": 0.16235352, "routerloss_mlp": 0.0, "step": 1790, "time_per_iteration": 3.046809196472168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01238128, "balance_loss_mlp": 1.22221315, "diversity_loss_mlp": 0.0, "epoch": 0.3445555983070412, "flos": 569313805824.0, "grad_norm": 0.07959093752215074, "language_loss": 0.83783114, "learning_rate": 0.0007622181681239483, "loss": 0.8502124, "num_input_tokens_seen": 148169616, "router_z_loss_mlp": 0.15905762, "routerloss_mlp": 0.0, "step": 1791, "time_per_iteration": 2.6601433753967285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01244342, "balance_loss_mlp": 1.22793913, "diversity_loss_mlp": 0.0, "epoch": 0.3447479799923047, "flos": 568814565888.0, "grad_norm": 0.07919089267187412, "language_loss": 0.84668601, "learning_rate": 0.0007619528546451202, "loss": 0.85912943, "num_input_tokens_seen": 148247824, "router_z_loss_mlp": 0.1640625, "routerloss_mlp": 0.0, "step": 1792, "time_per_iteration": 2.782947063446045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01208587, "balance_loss_mlp": 1.19314909, "diversity_loss_mlp": 0.0, "epoch": 0.3449403616775683, "flos": 967723863552.0, "grad_norm": 0.07332959959795217, "language_loss": 0.83832949, "learning_rate": 0.0007616874394671745, "loss": 0.85041535, "num_input_tokens_seen": 148333040, "router_z_loss_mlp": 0.1541748, "routerloss_mlp": 0.0, "step": 1793, "time_per_iteration": 3.3206703662872314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01184994, "balance_loss_mlp": 1.169258, "diversity_loss_mlp": 0.0, "epoch": 0.34513274336283184, "flos": 568607164416.0, "grad_norm": 0.0713753042238581, "language_loss": 0.85051751, "learning_rate": 0.0007614219226931547, "loss": 0.86236751, "num_input_tokens_seen": 148401840, "router_z_loss_mlp": 0.15722656, "routerloss_mlp": 0.0, "step": 1794, "time_per_iteration": 2.7190396785736084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01179587, "balance_loss_mlp": 1.16401851, "diversity_loss_mlp": 0.0, "epoch": 0.3453251250480954, "flos": 460943612928.0, "grad_norm": 0.07163818055438703, "language_loss": 0.8457973, "learning_rate": 0.0007611563044261435, "loss": 0.85759324, "num_input_tokens_seen": 148466576, "router_z_loss_mlp": 0.15551758, "routerloss_mlp": 0.0, "step": 1795, "time_per_iteration": 2.5077741146087646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01150042, "balance_loss_mlp": 1.13422251, "diversity_loss_mlp": 0.0, "epoch": 0.34551750673335896, "flos": 415621269504.0, "grad_norm": 0.0670543853763616, "language_loss": 0.86376798, "learning_rate": 0.0007608905847692631, "loss": 0.8752684, "num_input_tokens_seen": 148530016, "router_z_loss_mlp": 0.15808105, "routerloss_mlp": 0.0, "step": 1796, "time_per_iteration": 2.4662768840789795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112741, "balance_loss_mlp": 1.11171043, "diversity_loss_mlp": 0.0, "epoch": 0.34570988841862255, "flos": 587854609920.0, "grad_norm": 0.07671810253227593, "language_loss": 0.86553091, "learning_rate": 0.0007606247638256749, "loss": 0.87680501, "num_input_tokens_seen": 148610064, "router_z_loss_mlp": 0.15686035, "routerloss_mlp": 0.0, "step": 1797, "time_per_iteration": 2.8649494647979736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00624206, "balance_loss_mlp": 1.05204535, "diversity_loss_mlp": 0.16984753, "epoch": 0.34590227010388613, "flos": 1567694518272.0, "grad_norm": 0.0016633519833830733, "language_loss": 0.78170294, "learning_rate": 0.0007603588416985798, "loss": 0.78794497, "num_input_tokens_seen": 148835872, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01325956, "step": 1798, "time_per_iteration": 4.963132619857788 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055666, "balance_loss_mlp": 1.04498482, "diversity_loss_mlp": 0.0, "epoch": 0.34609465178914967, "flos": 1537743564288.0, "grad_norm": 0.032920799461559694, "language_loss": 0.79327202, "learning_rate": 0.0007600928184912179, "loss": 0.80382872, "num_input_tokens_seen": 149066864, "router_z_loss_mlp": 0.10693359, "routerloss_mlp": 0.0, "step": 1799, "time_per_iteration": 4.773633003234863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099713, "balance_loss_mlp": 1.08345306, "diversity_loss_mlp": 0.0, "epoch": 0.34628703347441325, "flos": 609363316224.0, "grad_norm": 0.10233507255995049, "language_loss": 0.85892332, "learning_rate": 0.0007598266943068686, "loss": 0.86992049, "num_input_tokens_seen": 149141600, "router_z_loss_mlp": 0.16259766, "routerloss_mlp": 0.0, "step": 1800, "time_per_iteration": 2.7380948066711426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092311, "balance_loss_mlp": 1.0761466, "diversity_loss_mlp": 0.0, "epoch": 0.3464794151596768, "flos": 473319936000.0, "grad_norm": 0.08416075255699706, "language_loss": 0.83903629, "learning_rate": 0.0007595604692488507, "loss": 0.84995937, "num_input_tokens_seen": 149205888, "router_z_loss_mlp": 0.16162109, "routerloss_mlp": 0.0, "step": 1801, "time_per_iteration": 2.5558300018310547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099836, "balance_loss_mlp": 1.08382583, "diversity_loss_mlp": 0.0, "epoch": 0.34667179684494037, "flos": 605681805312.0, "grad_norm": 0.0681721192963598, "language_loss": 0.82674247, "learning_rate": 0.0007592941434205215, "loss": 0.83774084, "num_input_tokens_seen": 149281280, "router_z_loss_mlp": 0.16003418, "routerloss_mlp": 0.0, "step": 1802, "time_per_iteration": 2.8181002140045166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017477, "balance_loss_mlp": 1.00651026, "diversity_loss_mlp": 0.0, "epoch": 0.3468641785302039, "flos": 1564912369152.0, "grad_norm": 0.018274165575771096, "language_loss": 0.73571062, "learning_rate": 0.0007590277169252782, "loss": 0.74588537, "num_input_tokens_seen": 149525008, "router_z_loss_mlp": 0.10986328, "routerloss_mlp": 0.0, "step": 1803, "time_per_iteration": 5.063629388809204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126513, "balance_loss_mlp": 1.11121821, "diversity_loss_mlp": 0.0, "epoch": 0.3470565602154675, "flos": 907265442816.0, "grad_norm": 0.07342722091818694, "language_loss": 0.80217302, "learning_rate": 0.0007587611898665566, "loss": 0.81343818, "num_input_tokens_seen": 149600624, "router_z_loss_mlp": 0.15270996, "routerloss_mlp": 0.0, "step": 1804, "time_per_iteration": 3.0994317531585693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113982, "balance_loss_mlp": 1.12468028, "diversity_loss_mlp": 0.0, "epoch": 0.347248941900731, "flos": 638902038528.0, "grad_norm": 0.05936466476556785, "language_loss": 0.82130265, "learning_rate": 0.0007584945623478315, "loss": 0.83270085, "num_input_tokens_seen": 149674224, "router_z_loss_mlp": 0.15112305, "routerloss_mlp": 0.0, "step": 1805, "time_per_iteration": 2.833981513977051 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152624, "balance_loss_mlp": 1.13780582, "diversity_loss_mlp": 0.0, "epoch": 0.3474413235859946, "flos": 847362788352.0, "grad_norm": 0.08744691316973383, "language_loss": 0.80801159, "learning_rate": 0.000758227834472617, "loss": 0.81953788, "num_input_tokens_seen": 149758688, "router_z_loss_mlp": 0.14807129, "routerloss_mlp": 0.0, "step": 1806, "time_per_iteration": 3.0535178184509277 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01166216, "balance_loss_mlp": 1.15111172, "diversity_loss_mlp": 0.0, "epoch": 0.3476337052712582, "flos": 515654926848.0, "grad_norm": 0.07500761638021176, "language_loss": 0.77729452, "learning_rate": 0.0007579610063444664, "loss": 0.7889567, "num_input_tokens_seen": 149831648, "router_z_loss_mlp": 0.15075684, "routerloss_mlp": 0.0, "step": 1807, "time_per_iteration": 2.7615864276885986 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149194, "balance_loss_mlp": 1.1339947, "diversity_loss_mlp": 0.0, "epoch": 0.34782608695652173, "flos": 913551063552.0, "grad_norm": 0.07406875426876382, "language_loss": 0.87547183, "learning_rate": 0.0007576940780669712, "loss": 0.88696373, "num_input_tokens_seen": 149919440, "router_z_loss_mlp": 0.1517334, "routerloss_mlp": 0.0, "step": 1808, "time_per_iteration": 3.264080762863159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143539, "balance_loss_mlp": 1.12863731, "diversity_loss_mlp": 0.0, "epoch": 0.3480184686417853, "flos": 773714944512.0, "grad_norm": 0.07928472428244501, "language_loss": 0.84104979, "learning_rate": 0.0007574270497437624, "loss": 0.85248518, "num_input_tokens_seen": 150001632, "router_z_loss_mlp": 0.14880371, "routerloss_mlp": 0.0, "step": 1809, "time_per_iteration": 2.9859273433685303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128376, "balance_loss_mlp": 1.11302221, "diversity_loss_mlp": 0.0, "epoch": 0.34821085032704885, "flos": 576839812608.0, "grad_norm": 0.07150597602774303, "language_loss": 0.88426095, "learning_rate": 0.000757159921478509, "loss": 0.89554477, "num_input_tokens_seen": 150077552, "router_z_loss_mlp": 0.15332031, "routerloss_mlp": 0.0, "step": 1810, "time_per_iteration": 2.7891488075256348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057512, "balance_loss_mlp": 1.04754615, "diversity_loss_mlp": 0.0, "epoch": 0.34840323201231244, "flos": 1524947295744.0, "grad_norm": 0.03228641235871289, "language_loss": 0.74450636, "learning_rate": 0.0007568926933749201, "loss": 0.75508153, "num_input_tokens_seen": 150295328, "router_z_loss_mlp": 0.09960938, "routerloss_mlp": 0.0, "step": 1811, "time_per_iteration": 4.737962007522583 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103488, "balance_loss_mlp": 1.08814573, "diversity_loss_mlp": 0.0, "epoch": 0.34859561369757597, "flos": 509164102656.0, "grad_norm": 0.07438083858778873, "language_loss": 0.87798911, "learning_rate": 0.0007566253655367423, "loss": 0.88902402, "num_input_tokens_seen": 150360496, "router_z_loss_mlp": 0.15319824, "routerloss_mlp": 0.0, "step": 1812, "time_per_iteration": 2.5879476070404053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091549, "balance_loss_mlp": 1.07600367, "diversity_loss_mlp": 0.0, "epoch": 0.34878799538283956, "flos": 548662616064.0, "grad_norm": 0.06854488097647142, "language_loss": 0.8957805, "learning_rate": 0.000756357938067762, "loss": 0.90669596, "num_input_tokens_seen": 150432064, "router_z_loss_mlp": 0.15527344, "routerloss_mlp": 0.0, "step": 1813, "time_per_iteration": 2.7090489864349365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094415, "balance_loss_mlp": 1.07826209, "diversity_loss_mlp": 0.0, "epoch": 0.34898037706810314, "flos": 983638536192.0, "grad_norm": 0.0690606019510397, "language_loss": 0.8334865, "learning_rate": 0.0007560904110718033, "loss": 0.84443069, "num_input_tokens_seen": 150512176, "router_z_loss_mlp": 0.16149902, "routerloss_mlp": 0.0, "step": 1814, "time_per_iteration": 3.2445590496063232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096093, "balance_loss_mlp": 1.08003569, "diversity_loss_mlp": 0.0, "epoch": 0.3491727587533667, "flos": 681605217792.0, "grad_norm": 0.06223934742271703, "language_loss": 0.83650601, "learning_rate": 0.0007558227846527297, "loss": 0.84746695, "num_input_tokens_seen": 150586416, "router_z_loss_mlp": 0.16052246, "routerloss_mlp": 0.0, "step": 1815, "time_per_iteration": 2.8504550457000732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110334, "balance_loss_mlp": 1.08731842, "diversity_loss_mlp": 0.0, "epoch": 0.34936514043863026, "flos": 394026301440.0, "grad_norm": 0.07831164241761415, "language_loss": 0.83117825, "learning_rate": 0.0007555550589144429, "loss": 0.84221166, "num_input_tokens_seen": 150648944, "router_z_loss_mlp": 0.16015625, "routerloss_mlp": 0.0, "step": 1816, "time_per_iteration": 2.4655556678771973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111049, "balance_loss_mlp": 1.09515882, "diversity_loss_mlp": 0.0, "epoch": 0.3495575221238938, "flos": 461363558400.0, "grad_norm": 0.08460625336983617, "language_loss": 0.84522688, "learning_rate": 0.000755287233960883, "loss": 0.85633731, "num_input_tokens_seen": 150717200, "router_z_loss_mlp": 0.15881348, "routerloss_mlp": 0.0, "step": 1817, "time_per_iteration": 2.602492094039917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089592, "balance_loss_mlp": 1.07385683, "diversity_loss_mlp": 0.0, "epoch": 0.3497499038091574, "flos": 724172576256.0, "grad_norm": 0.07045705340523431, "language_loss": 0.77682364, "learning_rate": 0.0007550193098960292, "loss": 0.78771949, "num_input_tokens_seen": 150790368, "router_z_loss_mlp": 0.15722656, "routerloss_mlp": 0.0, "step": 1818, "time_per_iteration": 2.8674800395965576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00989642, "balance_loss_mlp": 1.73270237, "diversity_loss_mlp": 0.21087486, "epoch": 0.3499422854944209, "flos": 827729528832.0, "grad_norm": 0.029406524514427698, "language_loss": 0.86412024, "learning_rate": 0.0007547512868238988, "loss": 0.87401664, "num_input_tokens_seen": 150879872, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01785346, "step": 1819, "time_per_iteration": 3.151559829711914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090552, "balance_loss_mlp": 1.07453036, "diversity_loss_mlp": 0.0, "epoch": 0.3501346671796845, "flos": 493479226368.0, "grad_norm": 0.06124546921927801, "language_loss": 0.83503008, "learning_rate": 0.0007544831648485473, "loss": 0.84593564, "num_input_tokens_seen": 150953712, "router_z_loss_mlp": 0.16015625, "routerloss_mlp": 0.0, "step": 1820, "time_per_iteration": 2.6791367530822754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094234, "balance_loss_mlp": 1.07806909, "diversity_loss_mlp": 0.0, "epoch": 0.35032704886494803, "flos": 578752335360.0, "grad_norm": 0.08232155140582742, "language_loss": 0.81448233, "learning_rate": 0.0007542149440740694, "loss": 0.82542467, "num_input_tokens_seen": 151026192, "router_z_loss_mlp": 0.16162109, "routerloss_mlp": 0.0, "step": 1821, "time_per_iteration": 2.665632724761963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088363, "balance_loss_mlp": 1.07229352, "diversity_loss_mlp": 0.0, "epoch": 0.3505194305502116, "flos": 584672338944.0, "grad_norm": 0.08177047744866778, "language_loss": 0.85514361, "learning_rate": 0.000753946624604597, "loss": 0.8660273, "num_input_tokens_seen": 151100720, "router_z_loss_mlp": 0.16064453, "routerloss_mlp": 0.0, "step": 1822, "time_per_iteration": 2.708221673965454 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085159, "balance_loss_mlp": 1.06938744, "diversity_loss_mlp": 0.0, "epoch": 0.3507118122354752, "flos": 526958991360.0, "grad_norm": 0.07022994660183399, "language_loss": 0.88119262, "learning_rate": 0.0007536782065443015, "loss": 0.89204431, "num_input_tokens_seen": 151166032, "router_z_loss_mlp": 0.15759277, "routerloss_mlp": 0.0, "step": 1823, "time_per_iteration": 2.633929967880249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109141, "balance_loss_mlp": 1.0758059, "diversity_loss_mlp": 0.0, "epoch": 0.35090419392073874, "flos": 511523735040.0, "grad_norm": 0.09965750131036237, "language_loss": 0.75038946, "learning_rate": 0.0007534096899973919, "loss": 0.7613036, "num_input_tokens_seen": 151232208, "router_z_loss_mlp": 0.15588379, "routerloss_mlp": 0.0, "step": 1824, "time_per_iteration": 2.585160732269287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089888, "balance_loss_mlp": 1.07460535, "diversity_loss_mlp": 0.0, "epoch": 0.3510965756060023, "flos": 564021149184.0, "grad_norm": 0.0636070515998131, "language_loss": 0.82941401, "learning_rate": 0.0007531410750681154, "loss": 0.84031284, "num_input_tokens_seen": 151308128, "router_z_loss_mlp": 0.15258789, "routerloss_mlp": 0.0, "step": 1825, "time_per_iteration": 2.7595911026000977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100118, "balance_loss_mlp": 1.08562207, "diversity_loss_mlp": 0.0, "epoch": 0.35128895729126586, "flos": 1020535137792.0, "grad_norm": 0.09267960960885083, "language_loss": 0.87015611, "learning_rate": 0.0007528723618607575, "loss": 0.88115728, "num_input_tokens_seen": 151402560, "router_z_loss_mlp": 0.14489746, "routerloss_mlp": 0.0, "step": 1826, "time_per_iteration": 3.4216692447662354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090335, "balance_loss_mlp": 1.07524323, "diversity_loss_mlp": 0.0, "epoch": 0.35148133897652944, "flos": 588262445568.0, "grad_norm": 0.07214965975453298, "language_loss": 0.82582879, "learning_rate": 0.0007526035504796422, "loss": 0.83673215, "num_input_tokens_seen": 151478816, "router_z_loss_mlp": 0.15087891, "routerloss_mlp": 0.0, "step": 1827, "time_per_iteration": 2.7822000980377197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094099, "balance_loss_mlp": 1.0794003, "diversity_loss_mlp": 0.0, "epoch": 0.351673720661793, "flos": 495300344832.0, "grad_norm": 0.07057247929289283, "language_loss": 0.86824054, "learning_rate": 0.0007523346410291312, "loss": 0.8791815, "num_input_tokens_seen": 151554528, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1828, "time_per_iteration": 2.7560181617736816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098232, "balance_loss_mlp": 1.08291376, "diversity_loss_mlp": 0.0, "epoch": 0.35186610234705656, "flos": 762670411776.0, "grad_norm": 0.0630617970486185, "language_loss": 0.85159689, "learning_rate": 0.0007520656336136245, "loss": 0.86257917, "num_input_tokens_seen": 151629440, "router_z_loss_mlp": 0.15307617, "routerloss_mlp": 0.0, "step": 1829, "time_per_iteration": 2.9432313442230225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098974, "balance_loss_mlp": 1.08431172, "diversity_loss_mlp": 0.0, "epoch": 0.3520584840323201, "flos": 626135132160.0, "grad_norm": 0.06541232162591855, "language_loss": 0.88230217, "learning_rate": 0.0007517965283375599, "loss": 0.89329195, "num_input_tokens_seen": 151708544, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1830, "time_per_iteration": 2.8773486614227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098552, "balance_loss_mlp": 1.08363926, "diversity_loss_mlp": 0.0, "epoch": 0.3522508657175837, "flos": 537388286976.0, "grad_norm": 0.06973135687475002, "language_loss": 0.89511967, "learning_rate": 0.0007515273253054132, "loss": 0.90610522, "num_input_tokens_seen": 151779152, "router_z_loss_mlp": 0.14892578, "routerloss_mlp": 0.0, "step": 1831, "time_per_iteration": 2.662757396697998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097986, "balance_loss_mlp": 1.08288169, "diversity_loss_mlp": 0.0, "epoch": 0.35244324740284727, "flos": 567384030720.0, "grad_norm": 0.07142201858296882, "language_loss": 0.82785273, "learning_rate": 0.0007512580246216988, "loss": 0.83883256, "num_input_tokens_seen": 151853216, "router_z_loss_mlp": 0.15075684, "routerloss_mlp": 0.0, "step": 1832, "time_per_iteration": 2.730994939804077 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096378, "balance_loss_mlp": 1.08164394, "diversity_loss_mlp": 0.0, "epoch": 0.3526356290881108, "flos": 513058157568.0, "grad_norm": 0.07119734441282773, "language_loss": 0.84715027, "learning_rate": 0.000750988626390968, "loss": 0.85811406, "num_input_tokens_seen": 151920416, "router_z_loss_mlp": 0.1472168, "routerloss_mlp": 0.0, "step": 1833, "time_per_iteration": 2.604182004928589 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089856, "balance_loss_mlp": 1.07508624, "diversity_loss_mlp": 0.0, "epoch": 0.3528280107733744, "flos": 595791023616.0, "grad_norm": 0.07060575001723658, "language_loss": 0.85089648, "learning_rate": 0.0007507191307178108, "loss": 0.86179501, "num_input_tokens_seen": 151990848, "router_z_loss_mlp": 0.14746094, "routerloss_mlp": 0.0, "step": 1834, "time_per_iteration": 2.7584774494171143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083532, "balance_loss_mlp": 1.06808281, "diversity_loss_mlp": 0.0, "epoch": 0.3530203924586379, "flos": 551234792448.0, "grad_norm": 0.09392412586459238, "language_loss": 0.75105453, "learning_rate": 0.0007504495377068543, "loss": 0.76188982, "num_input_tokens_seen": 152064864, "router_z_loss_mlp": 0.15429688, "routerloss_mlp": 0.0, "step": 1835, "time_per_iteration": 2.731039524078369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087025, "balance_loss_mlp": 1.07230306, "diversity_loss_mlp": 0.0, "epoch": 0.3532127741439015, "flos": 652990450176.0, "grad_norm": 0.09299008065025831, "language_loss": 0.81784093, "learning_rate": 0.0007501798474627642, "loss": 0.82871115, "num_input_tokens_seen": 152150096, "router_z_loss_mlp": 0.14697266, "routerloss_mlp": 0.0, "step": 1836, "time_per_iteration": 2.9180665016174316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092183, "balance_loss_mlp": 1.07738876, "diversity_loss_mlp": 0.0, "epoch": 0.35340515582916504, "flos": 722791226880.0, "grad_norm": 0.06800399913452355, "language_loss": 0.8354817, "learning_rate": 0.0007499100600902433, "loss": 0.84640354, "num_input_tokens_seen": 152232528, "router_z_loss_mlp": 0.14782715, "routerloss_mlp": 0.0, "step": 1837, "time_per_iteration": 2.981478452682495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097597, "balance_loss_mlp": 1.08236217, "diversity_loss_mlp": 0.0, "epoch": 0.35359753751442863, "flos": 594894233088.0, "grad_norm": 0.07178124654929893, "language_loss": 0.83625698, "learning_rate": 0.0007496401756940324, "loss": 0.84723294, "num_input_tokens_seen": 152299584, "router_z_loss_mlp": 0.15209961, "routerloss_mlp": 0.0, "step": 1838, "time_per_iteration": 2.7256877422332764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107267, "balance_loss_mlp": 1.09267545, "diversity_loss_mlp": 0.0, "epoch": 0.3537899191996922, "flos": 632668174848.0, "grad_norm": 0.08438072522416575, "language_loss": 0.81940264, "learning_rate": 0.0007493701943789098, "loss": 0.83047533, "num_input_tokens_seen": 152370368, "router_z_loss_mlp": 0.14575195, "routerloss_mlp": 0.0, "step": 1839, "time_per_iteration": 2.805553674697876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117723, "balance_loss_mlp": 1.10266685, "diversity_loss_mlp": 0.0, "epoch": 0.35398230088495575, "flos": 506364701184.0, "grad_norm": 0.07000666511795951, "language_loss": 0.82830888, "learning_rate": 0.000749100116249692, "loss": 0.83948612, "num_input_tokens_seen": 152436928, "router_z_loss_mlp": 0.1505127, "routerloss_mlp": 0.0, "step": 1840, "time_per_iteration": 2.608135223388672 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00954188, "balance_loss_mlp": 1.66862321, "diversity_loss_mlp": 0.20571998, "epoch": 0.35417468257021933, "flos": 508034944512.0, "grad_norm": 0.03743173710930313, "language_loss": 0.86076337, "learning_rate": 0.0007488299414112321, "loss": 0.87030524, "num_input_tokens_seen": 152505952, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01701665, "step": 1841, "time_per_iteration": 2.6307811737060547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112413, "balance_loss_mlp": 1.10974133, "diversity_loss_mlp": 0.0, "epoch": 0.35436706425548287, "flos": 656437395456.0, "grad_norm": 0.06710116446149988, "language_loss": 0.77204335, "learning_rate": 0.0007485596699684215, "loss": 0.78328466, "num_input_tokens_seen": 152577408, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1842, "time_per_iteration": 2.808776378631592 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132611, "balance_loss_mlp": 1.11780548, "diversity_loss_mlp": 0.0, "epoch": 0.35455944594074645, "flos": 652634744832.0, "grad_norm": 0.07987851383877129, "language_loss": 0.85353696, "learning_rate": 0.000748289302026189, "loss": 0.86486304, "num_input_tokens_seen": 152654480, "router_z_loss_mlp": 0.14794922, "routerloss_mlp": 0.0, "step": 1843, "time_per_iteration": 2.8449106216430664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127963, "balance_loss_mlp": 1.11339569, "diversity_loss_mlp": 0.0, "epoch": 0.35475182762601, "flos": 848593262592.0, "grad_norm": 0.06918658934745357, "language_loss": 0.85752398, "learning_rate": 0.0007480188376895004, "loss": 0.86880362, "num_input_tokens_seen": 152732304, "router_z_loss_mlp": 0.14550781, "routerloss_mlp": 0.0, "step": 1844, "time_per_iteration": 3.0339298248291016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01160602, "balance_loss_mlp": 1.15135121, "diversity_loss_mlp": 0.0, "epoch": 0.3549442093112736, "flos": 1521468043776.0, "grad_norm": 0.06421168097867443, "language_loss": 0.7381134, "learning_rate": 0.0007477482770633596, "loss": 0.74971944, "num_input_tokens_seen": 152965952, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 1845, "time_per_iteration": 4.932978391647339 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119726, "balance_loss_mlp": 1.10506296, "diversity_loss_mlp": 0.0, "epoch": 0.3551365909965371, "flos": 651411611136.0, "grad_norm": 0.08194467088107492, "language_loss": 0.78768218, "learning_rate": 0.0007474776202528074, "loss": 0.79887938, "num_input_tokens_seen": 153053088, "router_z_loss_mlp": 0.14660645, "routerloss_mlp": 0.0, "step": 1846, "time_per_iteration": 2.9188990592956543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111713, "balance_loss_mlp": 1.1021452, "diversity_loss_mlp": 0.0, "epoch": 0.3553289726818007, "flos": 897458724864.0, "grad_norm": 0.08015412782248336, "language_loss": 0.80999184, "learning_rate": 0.000747206867362922, "loss": 0.82116312, "num_input_tokens_seen": 153129216, "router_z_loss_mlp": 0.14953613, "routerloss_mlp": 0.0, "step": 1847, "time_per_iteration": 3.0966272354125977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099814, "balance_loss_mlp": 1.085235, "diversity_loss_mlp": 0.0, "epoch": 0.3555213543670643, "flos": 688491394560.0, "grad_norm": 0.09857033029565816, "language_loss": 0.836568, "learning_rate": 0.0007469360184988194, "loss": 0.84756613, "num_input_tokens_seen": 153199360, "router_z_loss_mlp": 0.14562988, "routerloss_mlp": 0.0, "step": 1848, "time_per_iteration": 2.9021246433258057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104687, "balance_loss_mlp": 1.08986914, "diversity_loss_mlp": 0.0, "epoch": 0.3557137360523278, "flos": 538564432896.0, "grad_norm": 0.08185517170087683, "language_loss": 0.86821651, "learning_rate": 0.0007466650737656518, "loss": 0.8792634, "num_input_tokens_seen": 153269168, "router_z_loss_mlp": 0.14794922, "routerloss_mlp": 0.0, "step": 1849, "time_per_iteration": 2.615549325942993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102053, "balance_loss_mlp": 1.0876888, "diversity_loss_mlp": 0.0, "epoch": 0.3559061177375914, "flos": 402261520896.0, "grad_norm": 0.06916390030254578, "language_loss": 0.89687926, "learning_rate": 0.0007463940332686098, "loss": 0.9078998, "num_input_tokens_seen": 153333120, "router_z_loss_mlp": 0.14367676, "routerloss_mlp": 0.0, "step": 1850, "time_per_iteration": 2.497159242630005 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00931214, "balance_loss_mlp": 1.62144685, "diversity_loss_mlp": 0.20650919, "epoch": 0.35609849942285493, "flos": 696568398336.0, "grad_norm": 0.030410176313075864, "language_loss": 0.84120536, "learning_rate": 0.0007461228971129205, "loss": 0.85051751, "num_input_tokens_seen": 153407600, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01723633, "step": 1851, "time_per_iteration": 2.959170341491699 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00931448, "balance_loss_mlp": 1.62270963, "diversity_loss_mlp": 0.20620242, "epoch": 0.3562908811081185, "flos": 568928365056.0, "grad_norm": 0.03221270440610224, "language_loss": 0.85523784, "learning_rate": 0.0007458516654038483, "loss": 0.86455238, "num_input_tokens_seen": 153477408, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01699215, "step": 1852, "time_per_iteration": 2.6886868476867676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149336, "balance_loss_mlp": 1.13526964, "diversity_loss_mlp": 0.0, "epoch": 0.35648326279338205, "flos": 682386011136.0, "grad_norm": 0.06572834298852859, "language_loss": 0.86835778, "learning_rate": 0.0007455803382466946, "loss": 0.8798511, "num_input_tokens_seen": 153551888, "router_z_loss_mlp": 0.14074707, "routerloss_mlp": 0.0, "step": 1853, "time_per_iteration": 2.8323659896850586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151398, "balance_loss_mlp": 1.13686657, "diversity_loss_mlp": 0.0, "epoch": 0.35667564447864564, "flos": 629139737088.0, "grad_norm": 0.06349489422764842, "language_loss": 0.86956179, "learning_rate": 0.0007453089157467979, "loss": 0.88107574, "num_input_tokens_seen": 153626912, "router_z_loss_mlp": 0.1451416, "routerloss_mlp": 0.0, "step": 1854, "time_per_iteration": 2.817117929458618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151473, "balance_loss_mlp": 1.13687038, "diversity_loss_mlp": 0.0, "epoch": 0.35686802616390917, "flos": 814048579584.0, "grad_norm": 0.06687597930641362, "language_loss": 0.8221277, "learning_rate": 0.0007450373980095341, "loss": 0.83364242, "num_input_tokens_seen": 153711312, "router_z_loss_mlp": 0.14587402, "routerloss_mlp": 0.0, "step": 1855, "time_per_iteration": 3.0857772827148438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148466, "balance_loss_mlp": 1.13494754, "diversity_loss_mlp": 0.0, "epoch": 0.35706040784917276, "flos": 526178198016.0, "grad_norm": 0.0656889709190827, "language_loss": 0.86804116, "learning_rate": 0.0007447657851403155, "loss": 0.87952584, "num_input_tokens_seen": 153780208, "router_z_loss_mlp": 0.13549805, "routerloss_mlp": 0.0, "step": 1856, "time_per_iteration": 2.6962759494781494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01144273, "balance_loss_mlp": 1.1303966, "diversity_loss_mlp": 0.0, "epoch": 0.35725278953443634, "flos": 511970844672.0, "grad_norm": 0.08894932465162153, "language_loss": 0.78988904, "learning_rate": 0.0007444940772445915, "loss": 0.80133176, "num_input_tokens_seen": 153853152, "router_z_loss_mlp": 0.13879395, "routerloss_mlp": 0.0, "step": 1857, "time_per_iteration": 2.752232551574707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122576, "balance_loss_mlp": 1.10860419, "diversity_loss_mlp": 0.0, "epoch": 0.3574451712196999, "flos": 487428171264.0, "grad_norm": 0.06705763345081875, "language_loss": 0.80129987, "learning_rate": 0.0007442222744278484, "loss": 0.81252563, "num_input_tokens_seen": 153924160, "router_z_loss_mlp": 0.13989258, "routerloss_mlp": 0.0, "step": 1858, "time_per_iteration": 2.638322591781616 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110883, "balance_loss_mlp": 1.09717393, "diversity_loss_mlp": 0.0, "epoch": 0.35763755290496346, "flos": 550671312384.0, "grad_norm": 0.05935371072747042, "language_loss": 0.8399322, "learning_rate": 0.0007439503767956099, "loss": 0.85104102, "num_input_tokens_seen": 153998688, "router_z_loss_mlp": 0.137146, "routerloss_mlp": 0.0, "step": 1859, "time_per_iteration": 2.699204921722412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124434, "balance_loss_mlp": 1.11480188, "diversity_loss_mlp": 0.0, "epoch": 0.357829934590227, "flos": 1504083561984.0, "grad_norm": 0.03541879327423246, "language_loss": 0.79671603, "learning_rate": 0.0007436783844534352, "loss": 0.80796039, "num_input_tokens_seen": 154230960, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 1860, "time_per_iteration": 4.89499831199646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089258, "balance_loss_mlp": 1.07479787, "diversity_loss_mlp": 0.0, "epoch": 0.3580223162754906, "flos": 568695997440.0, "grad_norm": 0.06413043417122823, "language_loss": 0.86215138, "learning_rate": 0.000743406297506922, "loss": 0.87304389, "num_input_tokens_seen": 154309104, "router_z_loss_mlp": 0.14465332, "routerloss_mlp": 0.0, "step": 1861, "time_per_iteration": 2.7184388637542725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00919817, "balance_loss_mlp": 1.60078692, "diversity_loss_mlp": 0.20507258, "epoch": 0.3582146979607541, "flos": 626473585152.0, "grad_norm": 0.028510278569739433, "language_loss": 0.84439111, "learning_rate": 0.0007431341160617031, "loss": 0.8535893, "num_input_tokens_seen": 154387424, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01688758, "step": 1862, "time_per_iteration": 2.8915610313415527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084391, "balance_loss_mlp": 1.06988358, "diversity_loss_mlp": 0.0, "epoch": 0.3584070796460177, "flos": 507271403520.0, "grad_norm": 0.06954606141633879, "language_loss": 0.88100171, "learning_rate": 0.0007428618402234491, "loss": 0.8918457, "num_input_tokens_seen": 154459952, "router_z_loss_mlp": 0.14501953, "routerloss_mlp": 0.0, "step": 1863, "time_per_iteration": 2.6724555492401123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087919, "balance_loss_mlp": 1.0733279, "diversity_loss_mlp": 0.0, "epoch": 0.3585994613312813, "flos": 606479851008.0, "grad_norm": 0.07542508091229044, "language_loss": 0.80288851, "learning_rate": 0.0007425894700978668, "loss": 0.81376767, "num_input_tokens_seen": 154535456, "router_z_loss_mlp": 0.14587402, "routerloss_mlp": 0.0, "step": 1864, "time_per_iteration": 2.724853038787842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083866, "balance_loss_mlp": 1.06996608, "diversity_loss_mlp": 0.0, "epoch": 0.3587918430165448, "flos": 1412886799872.0, "grad_norm": 0.07695346444963648, "language_loss": 0.7981261, "learning_rate": 0.0007423170057906996, "loss": 0.80896473, "num_input_tokens_seen": 154627568, "router_z_loss_mlp": 0.13916016, "routerloss_mlp": 0.0, "step": 1865, "time_per_iteration": 3.9006779193878174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108627, "balance_loss_mlp": 1.0722512, "diversity_loss_mlp": 0.0, "epoch": 0.3589842247018084, "flos": 478553121792.0, "grad_norm": 0.07814080760266444, "language_loss": 0.86228722, "learning_rate": 0.0007420444474077275, "loss": 0.87314993, "num_input_tokens_seen": 154694640, "router_z_loss_mlp": 0.14025879, "routerloss_mlp": 0.0, "step": 1866, "time_per_iteration": 2.546194076538086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095193, "balance_loss_mlp": 1.0812335, "diversity_loss_mlp": 0.0, "epoch": 0.35917660638707194, "flos": 504711710208.0, "grad_norm": 0.0773553058948038, "language_loss": 0.8949936, "learning_rate": 0.0007417717950547671, "loss": 0.90594554, "num_input_tokens_seen": 154762048, "router_z_loss_mlp": 0.13964844, "routerloss_mlp": 0.0, "step": 1867, "time_per_iteration": 2.5670700073242188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052493, "balance_loss_mlp": 1.04262233, "diversity_loss_mlp": 0.0, "epoch": 0.3593689880723355, "flos": 1492129382400.0, "grad_norm": 0.023944930622272237, "language_loss": 0.75996608, "learning_rate": 0.0007414990488376713, "loss": 0.770491, "num_input_tokens_seen": 154989952, "router_z_loss_mlp": 0.09863281, "routerloss_mlp": 0.0, "step": 1868, "time_per_iteration": 4.900780200958252 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101302, "balance_loss_mlp": 1.087533, "diversity_loss_mlp": 0.0, "epoch": 0.35956136975759906, "flos": 528629234688.0, "grad_norm": 0.06547244306940128, "language_loss": 0.84938717, "learning_rate": 0.0007412262088623299, "loss": 0.86040014, "num_input_tokens_seen": 155066992, "router_z_loss_mlp": 0.13793945, "routerloss_mlp": 0.0, "step": 1869, "time_per_iteration": 2.7674195766448975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0092029, "balance_loss_mlp": 1.60128522, "diversity_loss_mlp": 0.20662443, "epoch": 0.35975375144286265, "flos": 534917426688.0, "grad_norm": 0.03542659619783611, "language_loss": 0.79155517, "learning_rate": 0.0007409532752346684, "loss": 0.80075806, "num_input_tokens_seen": 155137616, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01633519, "step": 1870, "time_per_iteration": 2.7116785049438477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111485, "balance_loss_mlp": 1.101367, "diversity_loss_mlp": 0.0, "epoch": 0.3599461331281262, "flos": 504941506560.0, "grad_norm": 0.061502004439029076, "language_loss": 0.8836326, "learning_rate": 0.0007406802480606491, "loss": 0.89478111, "num_input_tokens_seen": 155209248, "router_z_loss_mlp": 0.13500977, "routerloss_mlp": 0.0, "step": 1871, "time_per_iteration": 2.642608165740967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105605, "balance_loss_mlp": 1.0916698, "diversity_loss_mlp": 0.0, "epoch": 0.36013851481338977, "flos": 511533646848.0, "grad_norm": 0.06939665757215846, "language_loss": 0.90353388, "learning_rate": 0.0007404071274462707, "loss": 0.91458994, "num_input_tokens_seen": 155274176, "router_z_loss_mlp": 0.1394043, "routerloss_mlp": 0.0, "step": 1872, "time_per_iteration": 2.5600955486297607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113518, "balance_loss_mlp": 1.09967744, "diversity_loss_mlp": 0.0, "epoch": 0.36033089649865335, "flos": 547590357504.0, "grad_norm": 0.07241097832053987, "language_loss": 0.83719409, "learning_rate": 0.0007401339134975682, "loss": 0.84832925, "num_input_tokens_seen": 155343232, "router_z_loss_mlp": 0.1385498, "routerloss_mlp": 0.0, "step": 1873, "time_per_iteration": 2.6775293350219727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111162, "balance_loss_mlp": 1.09724998, "diversity_loss_mlp": 0.0, "epoch": 0.3605232781839169, "flos": 458655561216.0, "grad_norm": 0.07980684605652169, "language_loss": 0.84604299, "learning_rate": 0.0007398606063206122, "loss": 0.85715467, "num_input_tokens_seen": 155410080, "router_z_loss_mlp": 0.13928223, "routerloss_mlp": 0.0, "step": 1874, "time_per_iteration": 2.6092889308929443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109131, "balance_loss_mlp": 1.09546924, "diversity_loss_mlp": 0.0, "epoch": 0.36071565986918047, "flos": 509559455232.0, "grad_norm": 0.09304103013369584, "language_loss": 0.78818524, "learning_rate": 0.0007395872060215101, "loss": 0.79927647, "num_input_tokens_seen": 155476240, "router_z_loss_mlp": 0.13684082, "routerloss_mlp": 0.0, "step": 1875, "time_per_iteration": 2.5999374389648438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124779, "balance_loss_mlp": 1.11121297, "diversity_loss_mlp": 0.0, "epoch": 0.360908041554444, "flos": 559195799040.0, "grad_norm": 0.08049441369365674, "language_loss": 0.8851527, "learning_rate": 0.0007393137127064056, "loss": 0.89640045, "num_input_tokens_seen": 155543392, "router_z_loss_mlp": 0.13574219, "routerloss_mlp": 0.0, "step": 1876, "time_per_iteration": 2.635896682739258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127435, "balance_loss_mlp": 1.11380959, "diversity_loss_mlp": 0.0, "epoch": 0.3611004232397076, "flos": 523845729792.0, "grad_norm": 0.06613177233605298, "language_loss": 0.84377646, "learning_rate": 0.0007390401264814779, "loss": 0.8550508, "num_input_tokens_seen": 155613264, "router_z_loss_mlp": 0.13635254, "routerloss_mlp": 0.0, "step": 1877, "time_per_iteration": 2.597508192062378 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151319, "balance_loss_mlp": 1.1378243, "diversity_loss_mlp": 0.0, "epoch": 0.3612928049249711, "flos": 540988305408.0, "grad_norm": 0.09083655630754779, "language_loss": 0.84454513, "learning_rate": 0.0007387664474529427, "loss": 0.8560583, "num_input_tokens_seen": 155683712, "router_z_loss_mlp": 0.13525391, "routerloss_mlp": 0.0, "step": 1878, "time_per_iteration": 2.6493661403656006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01143725, "balance_loss_mlp": 1.1302073, "diversity_loss_mlp": 0.0, "epoch": 0.3614851866102347, "flos": 552556670976.0, "grad_norm": 0.0643860955644754, "language_loss": 0.91379291, "learning_rate": 0.0007384926757270518, "loss": 0.92523015, "num_input_tokens_seen": 155751760, "router_z_loss_mlp": 0.13537598, "routerloss_mlp": 0.0, "step": 1879, "time_per_iteration": 2.62565016746521 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01152012, "balance_loss_mlp": 1.13819528, "diversity_loss_mlp": 0.0, "epoch": 0.36167756829549824, "flos": 772071865344.0, "grad_norm": 0.07609143241795291, "language_loss": 0.80057949, "learning_rate": 0.0007382188114100924, "loss": 0.81209958, "num_input_tokens_seen": 155830464, "router_z_loss_mlp": 0.13818359, "routerloss_mlp": 0.0, "step": 1880, "time_per_iteration": 2.974212169647217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01155662, "balance_loss_mlp": 1.14148784, "diversity_loss_mlp": 0.0, "epoch": 0.36186994998076183, "flos": 711885086208.0, "grad_norm": 0.0632350243804942, "language_loss": 0.8182314, "learning_rate": 0.0007379448546083884, "loss": 0.82978803, "num_input_tokens_seen": 155906208, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1881, "time_per_iteration": 2.894099712371826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01154364, "balance_loss_mlp": 1.14052355, "diversity_loss_mlp": 0.0, "epoch": 0.3620623316660254, "flos": 747546444288.0, "grad_norm": 0.06232367753538678, "language_loss": 0.8822301, "learning_rate": 0.0007376708054282992, "loss": 0.89377379, "num_input_tokens_seen": 155983584, "router_z_loss_mlp": 0.1385498, "routerloss_mlp": 0.0, "step": 1882, "time_per_iteration": 2.9576163291931152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01162916, "balance_loss_mlp": 1.14919519, "diversity_loss_mlp": 0.0, "epoch": 0.36225471335128895, "flos": 482555833344.0, "grad_norm": 0.06608098206448941, "language_loss": 0.83563071, "learning_rate": 0.0007373966639762201, "loss": 0.84725988, "num_input_tokens_seen": 156052464, "router_z_loss_mlp": 0.13757324, "routerloss_mlp": 0.0, "step": 1883, "time_per_iteration": 2.6004068851470947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01158732, "balance_loss_mlp": 1.14478457, "diversity_loss_mlp": 0.0, "epoch": 0.36244709503655254, "flos": 506905786368.0, "grad_norm": 0.07441448138889938, "language_loss": 0.88544619, "learning_rate": 0.0007371224303585822, "loss": 0.89703357, "num_input_tokens_seen": 156121424, "router_z_loss_mlp": 0.13964844, "routerloss_mlp": 0.0, "step": 1884, "time_per_iteration": 2.5741078853607178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109461, "balance_loss_mlp": 1.09897089, "diversity_loss_mlp": 0.0, "epoch": 0.36263947672181607, "flos": 1394050466304.0, "grad_norm": 0.03545085729862102, "language_loss": 0.80357069, "learning_rate": 0.0007368481046818524, "loss": 0.81466532, "num_input_tokens_seen": 156346144, "router_z_loss_mlp": 0.10498047, "routerloss_mlp": 0.0, "step": 1885, "time_per_iteration": 4.706872224807739 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01148036, "balance_loss_mlp": 1.13442218, "diversity_loss_mlp": 0.0, "epoch": 0.36283185840707965, "flos": 653296969728.0, "grad_norm": 0.0691831634947964, "language_loss": 0.8278423, "learning_rate": 0.0007365736870525335, "loss": 0.83932269, "num_input_tokens_seen": 156420880, "router_z_loss_mlp": 0.13635254, "routerloss_mlp": 0.0, "step": 1886, "time_per_iteration": 2.8480284214019775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135823, "balance_loss_mlp": 1.12236464, "diversity_loss_mlp": 0.0, "epoch": 0.3630242400923432, "flos": 488863848960.0, "grad_norm": 0.0786816251155578, "language_loss": 0.82659888, "learning_rate": 0.000736299177577164, "loss": 0.83795714, "num_input_tokens_seen": 156485616, "router_z_loss_mlp": 0.13476562, "routerloss_mlp": 0.0, "step": 1887, "time_per_iteration": 2.601449966430664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127197, "balance_loss_mlp": 1.11358309, "diversity_loss_mlp": 0.0, "epoch": 0.3632166217776068, "flos": 517159613952.0, "grad_norm": 0.0767010159800114, "language_loss": 0.8381778, "learning_rate": 0.0007360245763623174, "loss": 0.84944975, "num_input_tokens_seen": 156557840, "router_z_loss_mlp": 0.13635254, "routerloss_mlp": 0.0, "step": 1888, "time_per_iteration": 2.6951138973236084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106263, "balance_loss_mlp": 1.09350717, "diversity_loss_mlp": 0.0, "epoch": 0.36340900346287036, "flos": 646173656064.0, "grad_norm": 0.06311908909694558, "language_loss": 0.89886129, "learning_rate": 0.0007357498835146039, "loss": 0.90992391, "num_input_tokens_seen": 156632496, "router_z_loss_mlp": 0.12768555, "routerloss_mlp": 0.0, "step": 1889, "time_per_iteration": 2.8509137630462646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094399, "balance_loss_mlp": 1.08141732, "diversity_loss_mlp": 0.0, "epoch": 0.3636013851481339, "flos": 553327552512.0, "grad_norm": 0.06820711534899371, "language_loss": 0.86674547, "learning_rate": 0.0007354750991406684, "loss": 0.87768942, "num_input_tokens_seen": 156705296, "router_z_loss_mlp": 0.13000488, "routerloss_mlp": 0.0, "step": 1890, "time_per_iteration": 2.7162795066833496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089716, "balance_loss_mlp": 1.07673419, "diversity_loss_mlp": 0.0, "epoch": 0.3637937668333975, "flos": 546653919744.0, "grad_norm": 0.07876014589837055, "language_loss": 0.80930853, "learning_rate": 0.0007352002233471919, "loss": 0.82020569, "num_input_tokens_seen": 156773376, "router_z_loss_mlp": 0.12988281, "routerloss_mlp": 0.0, "step": 1891, "time_per_iteration": 2.631824016571045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091374, "balance_loss_mlp": 1.07835662, "diversity_loss_mlp": 0.0, "epoch": 0.363986148518661, "flos": 538112180736.0, "grad_norm": 0.08103720744805817, "language_loss": 0.79372823, "learning_rate": 0.0007349252562408906, "loss": 0.80464196, "num_input_tokens_seen": 156844336, "router_z_loss_mlp": 0.13024902, "routerloss_mlp": 0.0, "step": 1892, "time_per_iteration": 2.6752734184265137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097706, "balance_loss_mlp": 1.08496833, "diversity_loss_mlp": 0.0, "epoch": 0.3641785302039246, "flos": 660217651200.0, "grad_norm": 0.07356128462514616, "language_loss": 0.81490725, "learning_rate": 0.0007346501979285158, "loss": 0.82588428, "num_input_tokens_seen": 156918848, "router_z_loss_mlp": 0.12750244, "routerloss_mlp": 0.0, "step": 1893, "time_per_iteration": 2.8990893363952637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01040684, "balance_loss_mlp": 1.03214884, "diversity_loss_mlp": 0.0, "epoch": 0.36437091188918813, "flos": 1468743031296.0, "grad_norm": 0.022756463517582398, "language_loss": 0.80539101, "learning_rate": 0.0007343750485168551, "loss": 0.81579787, "num_input_tokens_seen": 157134736, "router_z_loss_mlp": 0.08544922, "routerloss_mlp": 0.0, "step": 1894, "time_per_iteration": 4.8097145557403564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098768, "balance_loss_mlp": 1.0857501, "diversity_loss_mlp": 0.0, "epoch": 0.3645632935744517, "flos": 597298281984.0, "grad_norm": 0.06969655176236832, "language_loss": 0.85880721, "learning_rate": 0.0007340998081127308, "loss": 0.86979485, "num_input_tokens_seen": 157211920, "router_z_loss_mlp": 0.13037109, "routerloss_mlp": 0.0, "step": 1895, "time_per_iteration": 2.757380485534668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087705, "balance_loss_mlp": 1.074646, "diversity_loss_mlp": 0.0, "epoch": 0.36475567525971525, "flos": 599509610496.0, "grad_norm": 0.06910669114263218, "language_loss": 0.91127002, "learning_rate": 0.0007338244768230007, "loss": 0.92214715, "num_input_tokens_seen": 157284224, "router_z_loss_mlp": 0.13079834, "routerloss_mlp": 0.0, "step": 1896, "time_per_iteration": 2.7967634201049805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098629, "balance_loss_mlp": 1.08584976, "diversity_loss_mlp": 0.0, "epoch": 0.36494805694497884, "flos": 798403350528.0, "grad_norm": 0.05804787602656793, "language_loss": 0.88684666, "learning_rate": 0.0007335490547545578, "loss": 0.89783299, "num_input_tokens_seen": 157367920, "router_z_loss_mlp": 0.12780762, "routerloss_mlp": 0.0, "step": 1897, "time_per_iteration": 3.086498260498047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095377, "balance_loss_mlp": 1.08286643, "diversity_loss_mlp": 0.0, "epoch": 0.3651404386302424, "flos": 637313287680.0, "grad_norm": 0.06953546528053214, "language_loss": 0.82679451, "learning_rate": 0.0007332735420143308, "loss": 0.83774823, "num_input_tokens_seen": 157438672, "router_z_loss_mlp": 0.12506104, "routerloss_mlp": 0.0, "step": 1898, "time_per_iteration": 2.788245439529419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097867, "balance_loss_mlp": 1.08476591, "diversity_loss_mlp": 0.0, "epoch": 0.36533282031550596, "flos": 491581757952.0, "grad_norm": 0.07600656362423025, "language_loss": 0.86647844, "learning_rate": 0.0007329979387092826, "loss": 0.87745708, "num_input_tokens_seen": 157505888, "router_z_loss_mlp": 0.13110352, "routerloss_mlp": 0.0, "step": 1899, "time_per_iteration": 2.5437934398651123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101105, "balance_loss_mlp": 1.08821869, "diversity_loss_mlp": 0.0, "epoch": 0.36552520200076954, "flos": 855970965504.0, "grad_norm": 0.05952938167480439, "language_loss": 0.83796108, "learning_rate": 0.0007327222449464124, "loss": 0.8489722, "num_input_tokens_seen": 157601568, "router_z_loss_mlp": 0.12902832, "routerloss_mlp": 0.0, "step": 1900, "time_per_iteration": 3.2824244499206543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011078, "balance_loss_mlp": 1.09499097, "diversity_loss_mlp": 0.0, "epoch": 0.3657175836860331, "flos": 483702243840.0, "grad_norm": 0.07745224305421915, "language_loss": 0.88634431, "learning_rate": 0.0007324464608327538, "loss": 0.89742231, "num_input_tokens_seen": 157670992, "router_z_loss_mlp": 0.12823486, "routerloss_mlp": 0.0, "step": 1901, "time_per_iteration": 2.6411991119384766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102996, "balance_loss_mlp": 1.08995461, "diversity_loss_mlp": 0.0, "epoch": 0.36590996537129666, "flos": 434792365056.0, "grad_norm": 0.08223816362142805, "language_loss": 0.88474846, "learning_rate": 0.0007321705864753758, "loss": 0.89577842, "num_input_tokens_seen": 157743616, "router_z_loss_mlp": 0.1305542, "routerloss_mlp": 0.0, "step": 1902, "time_per_iteration": 2.682002544403076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00931657, "balance_loss_mlp": 1.62497878, "diversity_loss_mlp": 0.20707282, "epoch": 0.3661023470565602, "flos": 712206286848.0, "grad_norm": 0.026825446902959647, "language_loss": 0.84137708, "learning_rate": 0.0007318946219813823, "loss": 0.85069364, "num_input_tokens_seen": 157823520, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01563089, "step": 1903, "time_per_iteration": 3.0061404705047607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108165, "balance_loss_mlp": 1.09403849, "diversity_loss_mlp": 0.0, "epoch": 0.3662947287418238, "flos": 564760097280.0, "grad_norm": 0.07526416733947026, "language_loss": 0.89736164, "learning_rate": 0.000731618567457912, "loss": 0.90844321, "num_input_tokens_seen": 157893248, "router_z_loss_mlp": 0.14105225, "routerloss_mlp": 0.0, "step": 1904, "time_per_iteration": 2.6523027420043945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099252, "balance_loss_mlp": 1.08536446, "diversity_loss_mlp": 0.0, "epoch": 0.3664871104270873, "flos": 789752954880.0, "grad_norm": 0.07605082206895837, "language_loss": 0.87058568, "learning_rate": 0.000731342423012139, "loss": 0.88157821, "num_input_tokens_seen": 157973216, "router_z_loss_mlp": 0.13903809, "routerloss_mlp": 0.0, "step": 1905, "time_per_iteration": 3.0595312118530273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096318, "balance_loss_mlp": 1.08213234, "diversity_loss_mlp": 0.0, "epoch": 0.3666794921123509, "flos": 752557174272.0, "grad_norm": 0.07718853495225737, "language_loss": 0.82559443, "learning_rate": 0.0007310661887512722, "loss": 0.83655763, "num_input_tokens_seen": 158051088, "router_z_loss_mlp": 0.1418457, "routerloss_mlp": 0.0, "step": 1906, "time_per_iteration": 3.056859016418457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090478, "balance_loss_mlp": 1.07672131, "diversity_loss_mlp": 0.0, "epoch": 0.3668718737976145, "flos": 523531869696.0, "grad_norm": 0.07458396044121823, "language_loss": 0.8194133, "learning_rate": 0.0007307898647825549, "loss": 0.83031803, "num_input_tokens_seen": 158124368, "router_z_loss_mlp": 0.13769531, "routerloss_mlp": 0.0, "step": 1907, "time_per_iteration": 2.670468807220459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090408, "balance_loss_mlp": 1.07666349, "diversity_loss_mlp": 0.0, "epoch": 0.367064255482878, "flos": 571967474688.0, "grad_norm": 0.09231339543244264, "language_loss": 0.89368939, "learning_rate": 0.0007305134512132659, "loss": 0.90459347, "num_input_tokens_seen": 158191472, "router_z_loss_mlp": 0.13751221, "routerloss_mlp": 0.0, "step": 1908, "time_per_iteration": 2.6561663150787354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091843, "balance_loss_mlp": 1.07826495, "diversity_loss_mlp": 0.0, "epoch": 0.3672566371681416, "flos": 447114359808.0, "grad_norm": 0.08913139219920335, "language_loss": 0.83308864, "learning_rate": 0.0007302369481507183, "loss": 0.84400707, "num_input_tokens_seen": 158254384, "router_z_loss_mlp": 0.13592529, "routerloss_mlp": 0.0, "step": 1909, "time_per_iteration": 2.5485799312591553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017138, "balance_loss_mlp": 1.00979447, "diversity_loss_mlp": 0.0, "epoch": 0.36744901885340514, "flos": 1540090713600.0, "grad_norm": 0.013277678950868657, "language_loss": 0.79961759, "learning_rate": 0.00072996035570226, "loss": 0.80978894, "num_input_tokens_seen": 158486160, "router_z_loss_mlp": 0.07324219, "routerloss_mlp": 0.0, "step": 1910, "time_per_iteration": 4.848855257034302 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111697, "balance_loss_mlp": 1.09842944, "diversity_loss_mlp": 0.0, "epoch": 0.36764140053866873, "flos": 563685267456.0, "grad_norm": 0.058739485749840115, "language_loss": 0.85315347, "learning_rate": 0.000729683673975274, "loss": 0.86427045, "num_input_tokens_seen": 158555616, "router_z_loss_mlp": 0.13287354, "routerloss_mlp": 0.0, "step": 1911, "time_per_iteration": 2.690218210220337 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114863, "balance_loss_mlp": 1.10165429, "diversity_loss_mlp": 0.0, "epoch": 0.36783378222393226, "flos": 1216663981056.0, "grad_norm": 0.05585809452393386, "language_loss": 0.8291769, "learning_rate": 0.0007294069030771774, "loss": 0.84032547, "num_input_tokens_seen": 158653984, "router_z_loss_mlp": 0.13232422, "routerloss_mlp": 0.0, "step": 1912, "time_per_iteration": 3.678927183151245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125561, "balance_loss_mlp": 1.1124301, "diversity_loss_mlp": 0.0, "epoch": 0.36802616390919585, "flos": 498724895232.0, "grad_norm": 0.06389765233013874, "language_loss": 0.90667701, "learning_rate": 0.0007291300431154224, "loss": 0.91793263, "num_input_tokens_seen": 158719728, "router_z_loss_mlp": 0.13140869, "routerloss_mlp": 0.0, "step": 1913, "time_per_iteration": 2.616999387741089 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043841, "balance_loss_mlp": 1.03611672, "diversity_loss_mlp": 0.0, "epoch": 0.36821854559445943, "flos": 1582146349056.0, "grad_norm": 0.02051984405011318, "language_loss": 0.70389736, "learning_rate": 0.0007288530941974955, "loss": 0.7143358, "num_input_tokens_seen": 158952544, "router_z_loss_mlp": 0.07714844, "routerloss_mlp": 0.0, "step": 1914, "time_per_iteration": 4.973980903625488 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137116, "balance_loss_mlp": 1.12441444, "diversity_loss_mlp": 0.0, "epoch": 0.36841092727972297, "flos": 835626295296.0, "grad_norm": 0.0814243559806059, "language_loss": 0.7981922, "learning_rate": 0.0007285760564309179, "loss": 0.8095634, "num_input_tokens_seen": 159039680, "router_z_loss_mlp": 0.1270752, "routerloss_mlp": 0.0, "step": 1915, "time_per_iteration": 3.091447353363037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127931, "balance_loss_mlp": 1.11485386, "diversity_loss_mlp": 0.0, "epoch": 0.36860330896498655, "flos": 689855118336.0, "grad_norm": 0.09574055809111115, "language_loss": 0.84848046, "learning_rate": 0.0007282989299232448, "loss": 0.85975981, "num_input_tokens_seen": 159128128, "router_z_loss_mlp": 0.13092041, "routerloss_mlp": 0.0, "step": 1916, "time_per_iteration": 3.074547052383423 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113017, "balance_loss_mlp": 1.09977341, "diversity_loss_mlp": 0.0, "epoch": 0.3687956906502501, "flos": 554182497792.0, "grad_norm": 0.08763204320127825, "language_loss": 0.83209801, "learning_rate": 0.0007280217147820668, "loss": 0.84322822, "num_input_tokens_seen": 159193248, "router_z_loss_mlp": 0.13256836, "routerloss_mlp": 0.0, "step": 1917, "time_per_iteration": 2.6260228157043457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092055, "balance_loss_mlp": 1.07888198, "diversity_loss_mlp": 0.0, "epoch": 0.3689880723355137, "flos": 576703991808.0, "grad_norm": 0.06316346716689762, "language_loss": 0.79465461, "learning_rate": 0.0007277444111150079, "loss": 0.80557513, "num_input_tokens_seen": 159265824, "router_z_loss_mlp": 0.13189697, "routerloss_mlp": 0.0, "step": 1918, "time_per_iteration": 2.6777923107147217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088544, "balance_loss_mlp": 1.07465601, "diversity_loss_mlp": 0.0, "epoch": 0.3691804540207772, "flos": 528868942848.0, "grad_norm": 0.09595367080188737, "language_loss": 0.84512901, "learning_rate": 0.0007274670190297272, "loss": 0.85601443, "num_input_tokens_seen": 159332992, "router_z_loss_mlp": 0.13891602, "routerloss_mlp": 0.0, "step": 1919, "time_per_iteration": 2.590839147567749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085845, "balance_loss_mlp": 1.07205224, "diversity_loss_mlp": 0.0, "epoch": 0.3693728357060408, "flos": 561019115520.0, "grad_norm": 0.07431087712553297, "language_loss": 0.82079387, "learning_rate": 0.0007271895386339179, "loss": 0.83165228, "num_input_tokens_seen": 159409808, "router_z_loss_mlp": 0.13806152, "routerloss_mlp": 0.0, "step": 1920, "time_per_iteration": 2.7924282550811768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094639, "balance_loss_mlp": 1.08048892, "diversity_loss_mlp": 0.0, "epoch": 0.3695652173913043, "flos": 579770265600.0, "grad_norm": 0.07797312778631413, "language_loss": 0.83431751, "learning_rate": 0.0007269119700353073, "loss": 0.84526384, "num_input_tokens_seen": 159486128, "router_z_loss_mlp": 0.14160156, "routerloss_mlp": 0.0, "step": 1921, "time_per_iteration": 2.7155139446258545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112357, "balance_loss_mlp": 1.0987196, "diversity_loss_mlp": 0.0, "epoch": 0.3697575990765679, "flos": 512914622976.0, "grad_norm": 0.07250682713227712, "language_loss": 0.84994757, "learning_rate": 0.0007266343133416571, "loss": 0.86107111, "num_input_tokens_seen": 159562224, "router_z_loss_mlp": 0.13647461, "routerloss_mlp": 0.0, "step": 1922, "time_per_iteration": 2.7394983768463135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073276, "balance_loss_mlp": 1.06564641, "diversity_loss_mlp": 0.0, "epoch": 0.3699499807618315, "flos": 1570640025600.0, "grad_norm": 0.035523530201468645, "language_loss": 0.77116919, "learning_rate": 0.0007263565686607632, "loss": 0.78190196, "num_input_tokens_seen": 159784768, "router_z_loss_mlp": 0.07617188, "routerloss_mlp": 0.0, "step": 1923, "time_per_iteration": 4.877161026000977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115366, "balance_loss_mlp": 1.10153794, "diversity_loss_mlp": 0.0, "epoch": 0.37014236244709503, "flos": 497338776576.0, "grad_norm": 0.0789330271899564, "language_loss": 0.84356588, "learning_rate": 0.0007260787361004556, "loss": 0.85471952, "num_input_tokens_seen": 159848608, "router_z_loss_mlp": 0.13830566, "routerloss_mlp": 0.0, "step": 1924, "time_per_iteration": 2.608745813369751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0103691, "balance_loss_mlp": 1.02985299, "diversity_loss_mlp": 0.0, "epoch": 0.3703347441323586, "flos": 1444368485376.0, "grad_norm": 0.021371165562314075, "language_loss": 0.73761505, "learning_rate": 0.0007258008157685987, "loss": 0.74798417, "num_input_tokens_seen": 160080928, "router_z_loss_mlp": 0.07080078, "routerloss_mlp": 0.0, "step": 1925, "time_per_iteration": 4.906585931777954 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114233, "balance_loss_mlp": 1.10069048, "diversity_loss_mlp": 0.0, "epoch": 0.37052712581762215, "flos": 563601203712.0, "grad_norm": 0.12026638393290963, "language_loss": 0.87422252, "learning_rate": 0.0007255228077730903, "loss": 0.88536477, "num_input_tokens_seen": 160148976, "router_z_loss_mlp": 0.13549805, "routerloss_mlp": 0.0, "step": 1926, "time_per_iteration": 2.6886680126190186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123836, "balance_loss_mlp": 1.11107421, "diversity_loss_mlp": 0.0, "epoch": 0.37071950750288574, "flos": 926078261760.0, "grad_norm": 0.06719853297068734, "language_loss": 0.81722987, "learning_rate": 0.0007252447122218632, "loss": 0.82846814, "num_input_tokens_seen": 160233504, "router_z_loss_mlp": 0.12768555, "routerloss_mlp": 0.0, "step": 1927, "time_per_iteration": 3.1511058807373047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125512, "balance_loss_mlp": 1.11258984, "diversity_loss_mlp": 0.0, "epoch": 0.37091188918814927, "flos": 418312014336.0, "grad_norm": 0.08764579691953547, "language_loss": 0.87849444, "learning_rate": 0.0007249665292228834, "loss": 0.88974959, "num_input_tokens_seen": 160299696, "router_z_loss_mlp": 0.12939453, "routerloss_mlp": 0.0, "step": 1928, "time_per_iteration": 2.565991163253784 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120289, "balance_loss_mlp": 1.1073308, "diversity_loss_mlp": 0.0, "epoch": 0.37110427087341286, "flos": 463182105600.0, "grad_norm": 0.0633685198143462, "language_loss": 0.83318496, "learning_rate": 0.000724688258884151, "loss": 0.84438789, "num_input_tokens_seen": 160367904, "router_z_loss_mlp": 0.12963867, "routerloss_mlp": 0.0, "step": 1929, "time_per_iteration": 2.531827926635742 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115297, "balance_loss_mlp": 1.10286927, "diversity_loss_mlp": 0.0, "epoch": 0.3712966525586764, "flos": 849658180608.0, "grad_norm": 0.05744658583323744, "language_loss": 0.86564112, "learning_rate": 0.0007244099013137002, "loss": 0.8767941, "num_input_tokens_seen": 160453600, "router_z_loss_mlp": 0.12432861, "routerloss_mlp": 0.0, "step": 1930, "time_per_iteration": 3.1130166053771973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116667, "balance_loss_mlp": 1.10404849, "diversity_loss_mlp": 0.0, "epoch": 0.37148903424394, "flos": 925954550784.0, "grad_norm": 0.06880018611034966, "language_loss": 0.88695574, "learning_rate": 0.0007241314566195993, "loss": 0.89812243, "num_input_tokens_seen": 160543472, "router_z_loss_mlp": 0.1262207, "routerloss_mlp": 0.0, "step": 1931, "time_per_iteration": 3.374743700027466 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110097, "balance_loss_mlp": 1.08821416, "diversity_loss_mlp": 0.0, "epoch": 0.37168141592920356, "flos": 519815854080.0, "grad_norm": 0.06303779661636588, "language_loss": 0.85510373, "learning_rate": 0.0007238529249099496, "loss": 0.86611342, "num_input_tokens_seen": 160614016, "router_z_loss_mlp": 0.12750244, "routerloss_mlp": 0.0, "step": 1932, "time_per_iteration": 2.6654059886932373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097731, "balance_loss_mlp": 1.0911988, "diversity_loss_mlp": 0.0, "epoch": 0.3718737976144671, "flos": 1445895567360.0, "grad_norm": 0.03412398452916775, "language_loss": 0.77856874, "learning_rate": 0.0007235743062928872, "loss": 0.78954613, "num_input_tokens_seen": 160828640, "router_z_loss_mlp": 0.06542969, "routerloss_mlp": 0.0, "step": 1933, "time_per_iteration": 4.851354598999023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091175, "balance_loss_mlp": 1.07859278, "diversity_loss_mlp": 0.0, "epoch": 0.3720661792997307, "flos": 759564490752.0, "grad_norm": 0.08014253307267598, "language_loss": 0.80636895, "learning_rate": 0.000723295600876581, "loss": 0.81728071, "num_input_tokens_seen": 160913088, "router_z_loss_mlp": 0.12597656, "routerloss_mlp": 0.0, "step": 1934, "time_per_iteration": 3.0025534629821777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097044, "balance_loss_mlp": 1.08416963, "diversity_loss_mlp": 0.0, "epoch": 0.3722585609849942, "flos": 516956981760.0, "grad_norm": 0.08698689907724866, "language_loss": 0.88006312, "learning_rate": 0.0007230168087692344, "loss": 0.89103359, "num_input_tokens_seen": 160982960, "router_z_loss_mlp": 0.12872314, "routerloss_mlp": 0.0, "step": 1935, "time_per_iteration": 2.6499342918395996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095813, "balance_loss_mlp": 1.0830214, "diversity_loss_mlp": 0.0, "epoch": 0.3724509426702578, "flos": 782464084992.0, "grad_norm": 0.07031074193849007, "language_loss": 0.82382512, "learning_rate": 0.0007227379300790839, "loss": 0.8347832, "num_input_tokens_seen": 161066000, "router_z_loss_mlp": 0.12805176, "routerloss_mlp": 0.0, "step": 1936, "time_per_iteration": 3.0040676593780518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092601, "balance_loss_mlp": 1.07969058, "diversity_loss_mlp": 0.0, "epoch": 0.37264332435552133, "flos": 391720997376.0, "grad_norm": 0.07132774808829288, "language_loss": 0.85478282, "learning_rate": 0.0007224589649143997, "loss": 0.86570889, "num_input_tokens_seen": 161131040, "router_z_loss_mlp": 0.12915039, "routerloss_mlp": 0.0, "step": 1937, "time_per_iteration": 2.584545612335205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089825, "balance_loss_mlp": 1.07662272, "diversity_loss_mlp": 0.0, "epoch": 0.3728357060407849, "flos": 542861180928.0, "grad_norm": 0.0711139803163438, "language_loss": 0.8120302, "learning_rate": 0.0007221799133834861, "loss": 0.82292843, "num_input_tokens_seen": 161201248, "router_z_loss_mlp": 0.13214111, "routerloss_mlp": 0.0, "step": 1938, "time_per_iteration": 2.6393649578094482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109955, "balance_loss_mlp": 1.08649623, "diversity_loss_mlp": 0.0, "epoch": 0.3730280877260485, "flos": 433571802624.0, "grad_norm": 0.20460237815205612, "language_loss": 0.81793052, "learning_rate": 0.00072190077559468, "loss": 0.82892597, "num_input_tokens_seen": 161266288, "router_z_loss_mlp": 0.1307373, "routerloss_mlp": 0.0, "step": 1939, "time_per_iteration": 2.5494682788848877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127139, "balance_loss_mlp": 1.1140976, "diversity_loss_mlp": 0.0, "epoch": 0.37322046941131204, "flos": 531485535744.0, "grad_norm": 0.05817015695703163, "language_loss": 0.89248812, "learning_rate": 0.0007216215516563527, "loss": 0.90375948, "num_input_tokens_seen": 161335648, "router_z_loss_mlp": 0.13049316, "routerloss_mlp": 0.0, "step": 1940, "time_per_iteration": 2.6755452156066895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01129035, "balance_loss_mlp": 1.1159811, "diversity_loss_mlp": 0.0, "epoch": 0.3734128510965756, "flos": 531549775872.0, "grad_norm": 0.07778932214282369, "language_loss": 0.83852386, "learning_rate": 0.0007213422416769083, "loss": 0.84981418, "num_input_tokens_seen": 161403440, "router_z_loss_mlp": 0.1307373, "routerloss_mlp": 0.0, "step": 1941, "time_per_iteration": 2.6008002758026123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135994, "balance_loss_mlp": 1.12319708, "diversity_loss_mlp": 0.0, "epoch": 0.37360523278183916, "flos": 500442126336.0, "grad_norm": 0.06345716224902766, "language_loss": 0.7501297, "learning_rate": 0.0007210628457647849, "loss": 0.76148963, "num_input_tokens_seen": 161472864, "router_z_loss_mlp": 0.12811279, "routerloss_mlp": 0.0, "step": 1942, "time_per_iteration": 2.5911362171173096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01140859, "balance_loss_mlp": 1.12763917, "diversity_loss_mlp": 0.0, "epoch": 0.37379761446710275, "flos": 547943491584.0, "grad_norm": 0.06753886702103719, "language_loss": 0.78585184, "learning_rate": 0.000720783364028453, "loss": 0.7972604, "num_input_tokens_seen": 161548096, "router_z_loss_mlp": 0.13238525, "routerloss_mlp": 0.0, "step": 1943, "time_per_iteration": 2.7490458488464355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01149977, "balance_loss_mlp": 1.13685822, "diversity_loss_mlp": 0.0, "epoch": 0.3739899961523663, "flos": 475761060864.0, "grad_norm": 0.0650742437261564, "language_loss": 0.87667847, "learning_rate": 0.0007205037965764177, "loss": 0.88817823, "num_input_tokens_seen": 161615600, "router_z_loss_mlp": 0.13140869, "routerloss_mlp": 0.0, "step": 1944, "time_per_iteration": 2.5870554447174072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134812, "balance_loss_mlp": 1.12192512, "diversity_loss_mlp": 0.0, "epoch": 0.37418237783762986, "flos": 611915668992.0, "grad_norm": 0.07468357539719116, "language_loss": 0.85650361, "learning_rate": 0.0007202241435172161, "loss": 0.86785173, "num_input_tokens_seen": 161687408, "router_z_loss_mlp": 0.12902832, "routerloss_mlp": 0.0, "step": 1945, "time_per_iteration": 2.7550253868103027 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131811, "balance_loss_mlp": 1.11901414, "diversity_loss_mlp": 0.0, "epoch": 0.3743747595228934, "flos": 766287682560.0, "grad_norm": 0.07270487210957549, "language_loss": 0.87884831, "learning_rate": 0.0007199444049594198, "loss": 0.8901664, "num_input_tokens_seen": 161764224, "router_z_loss_mlp": 0.12805176, "routerloss_mlp": 0.0, "step": 1946, "time_per_iteration": 2.9499337673187256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111095, "balance_loss_mlp": 1.09783912, "diversity_loss_mlp": 0.0, "epoch": 0.374567141208157, "flos": 524394155520.0, "grad_norm": 0.07247382516020226, "language_loss": 0.83384776, "learning_rate": 0.0007196645810116322, "loss": 0.84495866, "num_input_tokens_seen": 161835520, "router_z_loss_mlp": 0.13269043, "routerloss_mlp": 0.0, "step": 1947, "time_per_iteration": 2.70394229888916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113218, "balance_loss_mlp": 1.1003499, "diversity_loss_mlp": 0.0, "epoch": 0.37475952289342057, "flos": 681375421440.0, "grad_norm": 0.07522309633784076, "language_loss": 0.84431696, "learning_rate": 0.0007193846717824912, "loss": 0.8554492, "num_input_tokens_seen": 161912000, "router_z_loss_mlp": 0.12884521, "routerloss_mlp": 0.0, "step": 1948, "time_per_iteration": 2.923752546310425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116002, "balance_loss_mlp": 1.10312748, "diversity_loss_mlp": 0.0, "epoch": 0.3749519045786841, "flos": 460291299840.0, "grad_norm": 0.06883561802065806, "language_loss": 0.88268626, "learning_rate": 0.0007191046773806669, "loss": 0.89384627, "num_input_tokens_seen": 161977296, "router_z_loss_mlp": 0.12878418, "routerloss_mlp": 0.0, "step": 1949, "time_per_iteration": 2.562816858291626 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108921, "balance_loss_mlp": 1.09593272, "diversity_loss_mlp": 0.0, "epoch": 0.3751442862639477, "flos": 954853443072.0, "grad_norm": 0.07969110082801287, "language_loss": 0.83211446, "learning_rate": 0.0007188245979148631, "loss": 0.84320366, "num_input_tokens_seen": 162051888, "router_z_loss_mlp": 0.13006592, "routerloss_mlp": 0.0, "step": 1950, "time_per_iteration": 3.193124294281006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111725, "balance_loss_mlp": 1.09892154, "diversity_loss_mlp": 0.0, "epoch": 0.3753366679492112, "flos": 527747125248.0, "grad_norm": 0.07005872092850987, "language_loss": 0.87434363, "learning_rate": 0.0007185444334938157, "loss": 0.88546085, "num_input_tokens_seen": 162124384, "router_z_loss_mlp": 0.12811279, "routerloss_mlp": 0.0, "step": 1951, "time_per_iteration": 2.669201135635376 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101783, "balance_loss_mlp": 1.0892663, "diversity_loss_mlp": 0.0, "epoch": 0.3755290496344748, "flos": 521797386240.0, "grad_norm": 0.08195801919923047, "language_loss": 0.85047525, "learning_rate": 0.0007182641842262947, "loss": 0.86149311, "num_input_tokens_seen": 162191440, "router_z_loss_mlp": 0.12518311, "routerloss_mlp": 0.0, "step": 1952, "time_per_iteration": 2.602139472961426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092197, "balance_loss_mlp": 1.07936394, "diversity_loss_mlp": 0.0, "epoch": 0.37572143131973834, "flos": 621121830912.0, "grad_norm": 0.07349771430020792, "language_loss": 0.77754879, "learning_rate": 0.0007179838502211022, "loss": 0.78847075, "num_input_tokens_seen": 162268480, "router_z_loss_mlp": 0.128479, "routerloss_mlp": 0.0, "step": 1953, "time_per_iteration": 2.85720157623291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094498, "balance_loss_mlp": 1.08148086, "diversity_loss_mlp": 0.0, "epoch": 0.37591381300500193, "flos": 770962530816.0, "grad_norm": 0.0681681729591206, "language_loss": 0.86330736, "learning_rate": 0.0007177034315870738, "loss": 0.87425238, "num_input_tokens_seen": 162346752, "router_z_loss_mlp": 0.13024902, "routerloss_mlp": 0.0, "step": 1954, "time_per_iteration": 2.958862066268921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101066, "balance_loss_mlp": 1.08803654, "diversity_loss_mlp": 0.0, "epoch": 0.37610619469026546, "flos": 520448343552.0, "grad_norm": 0.06642365438263753, "language_loss": 0.90809441, "learning_rate": 0.0007174229284330773, "loss": 0.91910505, "num_input_tokens_seen": 162415120, "router_z_loss_mlp": 0.13037109, "routerloss_mlp": 0.0, "step": 1955, "time_per_iteration": 2.5824947357177734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108936, "balance_loss_mlp": 1.07642531, "diversity_loss_mlp": 0.0, "epoch": 0.37629857637552905, "flos": 598812880896.0, "grad_norm": 0.07788827503332588, "language_loss": 0.86705017, "learning_rate": 0.0007171423408680141, "loss": 0.87794375, "num_input_tokens_seen": 162493280, "router_z_loss_mlp": 0.12939453, "routerloss_mlp": 0.0, "step": 1956, "time_per_iteration": 2.8101606369018555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00950311, "balance_loss_mlp": 1.6602329, "diversity_loss_mlp": 0.20739825, "epoch": 0.37649095806079264, "flos": 564952817664.0, "grad_norm": 0.03218717292019043, "language_loss": 0.89567441, "learning_rate": 0.0007168616690008176, "loss": 0.90517747, "num_input_tokens_seen": 162560736, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01649548, "step": 1957, "time_per_iteration": 2.6774377822875977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081569, "balance_loss_mlp": 1.06840825, "diversity_loss_mlp": 0.0, "epoch": 0.37668333974605617, "flos": 592470360576.0, "grad_norm": 0.07242251254882147, "language_loss": 0.85681045, "learning_rate": 0.0007165809129404545, "loss": 0.86762613, "num_input_tokens_seen": 162630688, "router_z_loss_mlp": 0.13171387, "routerloss_mlp": 0.0, "step": 1958, "time_per_iteration": 2.8396048545837402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090105, "balance_loss_mlp": 1.07657433, "diversity_loss_mlp": 0.0, "epoch": 0.37687572143131975, "flos": 419478248448.0, "grad_norm": 0.08227545286248691, "language_loss": 0.86212921, "learning_rate": 0.0007163000727959239, "loss": 0.87303019, "num_input_tokens_seen": 162694304, "router_z_loss_mlp": 0.13562012, "routerloss_mlp": 0.0, "step": 1959, "time_per_iteration": 2.478990316390991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087148, "balance_loss_mlp": 1.07989979, "diversity_loss_mlp": 0.0, "epoch": 0.3770681031165833, "flos": 1357262148096.0, "grad_norm": 0.05215322395932221, "language_loss": 0.77959073, "learning_rate": 0.0007160191486762575, "loss": 0.79046214, "num_input_tokens_seen": 162920336, "router_z_loss_mlp": 0.07226562, "routerloss_mlp": 0.0, "step": 1960, "time_per_iteration": 4.869986057281494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095327, "balance_loss_mlp": 1.08232689, "diversity_loss_mlp": 0.0, "epoch": 0.3772604848018469, "flos": 644903534592.0, "grad_norm": 0.08048811275026858, "language_loss": 0.84568793, "learning_rate": 0.00071573814069052, "loss": 0.85664117, "num_input_tokens_seen": 163000720, "router_z_loss_mlp": 0.13018799, "routerloss_mlp": 0.0, "step": 1961, "time_per_iteration": 2.9122819900512695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109901, "balance_loss_mlp": 1.08614171, "diversity_loss_mlp": 0.0, "epoch": 0.3774528664871104, "flos": 901651585536.0, "grad_norm": 0.06061063893945359, "language_loss": 0.88073885, "learning_rate": 0.0007154570489478081, "loss": 0.89172894, "num_input_tokens_seen": 163085680, "router_z_loss_mlp": 0.12878418, "routerloss_mlp": 0.0, "step": 1962, "time_per_iteration": 3.1824018955230713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111399, "balance_loss_mlp": 1.10154414, "diversity_loss_mlp": 0.0, "epoch": 0.377645248172374, "flos": 788065459200.0, "grad_norm": 0.06274200702745775, "language_loss": 0.86391222, "learning_rate": 0.0007151758735572514, "loss": 0.87505209, "num_input_tokens_seen": 163162224, "router_z_loss_mlp": 0.12451172, "routerloss_mlp": 0.0, "step": 1963, "time_per_iteration": 2.997624158859253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111089, "balance_loss_mlp": 1.09836888, "diversity_loss_mlp": 0.0, "epoch": 0.3778376298576376, "flos": 586718111232.0, "grad_norm": 0.07983075782925624, "language_loss": 0.80894458, "learning_rate": 0.0007148946146280119, "loss": 0.82005548, "num_input_tokens_seen": 163237920, "router_z_loss_mlp": 0.12731934, "routerloss_mlp": 0.0, "step": 1964, "time_per_iteration": 2.836583137512207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00620122, "balance_loss_mlp": 1.05382681, "diversity_loss_mlp": 0.16216688, "epoch": 0.3780300115429011, "flos": 1396743782400.0, "grad_norm": 0.0017779517528101797, "language_loss": 0.72192144, "learning_rate": 0.000714613272269284, "loss": 0.72812271, "num_input_tokens_seen": 163455760, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01212509, "step": 1965, "time_per_iteration": 4.906678915023804 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01030562, "balance_loss_mlp": 1.02436352, "diversity_loss_mlp": 0.0, "epoch": 0.3782223932281647, "flos": 1357672555008.0, "grad_norm": 0.025755206304302582, "language_loss": 0.75341946, "learning_rate": 0.0007143318465902943, "loss": 0.7637251, "num_input_tokens_seen": 163678064, "router_z_loss_mlp": 0.06176758, "routerloss_mlp": 0.0, "step": 1966, "time_per_iteration": 4.93319296836853 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127949, "balance_loss_mlp": 1.11581361, "diversity_loss_mlp": 0.0, "epoch": 0.37841477491342823, "flos": 704151304704.0, "grad_norm": 0.05898800907157556, "language_loss": 0.83873129, "learning_rate": 0.0007140503377003022, "loss": 0.85001081, "num_input_tokens_seen": 163764320, "router_z_loss_mlp": 0.12127686, "routerloss_mlp": 0.0, "step": 1967, "time_per_iteration": 2.9807000160217285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123739, "balance_loss_mlp": 1.11125755, "diversity_loss_mlp": 0.0, "epoch": 0.3786071565986918, "flos": 529115991552.0, "grad_norm": 0.06421364750503517, "language_loss": 0.84625173, "learning_rate": 0.000713768745708599, "loss": 0.85748911, "num_input_tokens_seen": 163831808, "router_z_loss_mlp": 0.12481689, "routerloss_mlp": 0.0, "step": 1968, "time_per_iteration": 2.610745429992676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118961, "balance_loss_mlp": 1.10671234, "diversity_loss_mlp": 0.0, "epoch": 0.37879953828395535, "flos": 993277126656.0, "grad_norm": 0.06880095080762995, "language_loss": 0.77052647, "learning_rate": 0.0007134870707245085, "loss": 0.78171611, "num_input_tokens_seen": 163918128, "router_z_loss_mlp": 0.12249756, "routerloss_mlp": 0.0, "step": 1969, "time_per_iteration": 3.302985429763794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120912, "balance_loss_mlp": 1.10852587, "diversity_loss_mlp": 0.0, "epoch": 0.37899191996921894, "flos": 626644283904.0, "grad_norm": 0.07142024228833302, "language_loss": 0.84469545, "learning_rate": 0.0007132053128573864, "loss": 0.85590458, "num_input_tokens_seen": 163987552, "router_z_loss_mlp": 0.12384033, "routerloss_mlp": 0.0, "step": 1970, "time_per_iteration": 2.7751197814941406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124437, "balance_loss_mlp": 1.11231327, "diversity_loss_mlp": 0.0, "epoch": 0.37918430165448247, "flos": 686307230208.0, "grad_norm": 0.06795721743578591, "language_loss": 0.83786452, "learning_rate": 0.0007129234722166211, "loss": 0.84910882, "num_input_tokens_seen": 164063248, "router_z_loss_mlp": 0.12115479, "routerloss_mlp": 0.0, "step": 1971, "time_per_iteration": 2.806898832321167 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114684, "balance_loss_mlp": 1.10238707, "diversity_loss_mlp": 0.0, "epoch": 0.37937668333974606, "flos": 475622668800.0, "grad_norm": 0.06601167392952549, "language_loss": 0.91087604, "learning_rate": 0.0007126415489116328, "loss": 0.92202282, "num_input_tokens_seen": 164133776, "router_z_loss_mlp": 0.1229248, "routerloss_mlp": 0.0, "step": 1972, "time_per_iteration": 2.656651496887207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109875, "balance_loss_mlp": 1.09782279, "diversity_loss_mlp": 0.0, "epoch": 0.37956906502500964, "flos": 707580997632.0, "grad_norm": 0.06641244535011205, "language_loss": 0.81145501, "learning_rate": 0.0007123595430518736, "loss": 0.82255375, "num_input_tokens_seen": 164206672, "router_z_loss_mlp": 0.12042236, "routerloss_mlp": 0.0, "step": 1973, "time_per_iteration": 2.8665072917938232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102568, "balance_loss_mlp": 1.09068835, "diversity_loss_mlp": 0.0, "epoch": 0.3797614467102732, "flos": 426648549888.0, "grad_norm": 0.07235703206146665, "language_loss": 0.86411089, "learning_rate": 0.0007120774547468282, "loss": 0.87513655, "num_input_tokens_seen": 164271968, "router_z_loss_mlp": 0.11877441, "routerloss_mlp": 0.0, "step": 1974, "time_per_iteration": 2.5590381622314453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00948323, "balance_loss_mlp": 1.65707994, "diversity_loss_mlp": 0.20756721, "epoch": 0.37995382839553676, "flos": 481846620672.0, "grad_norm": 0.03148003592885531, "language_loss": 0.81558585, "learning_rate": 0.0007117952841060128, "loss": 0.82506907, "num_input_tokens_seen": 164342800, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01599924, "step": 1975, "time_per_iteration": 2.6777563095092773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083824, "balance_loss_mlp": 1.07167053, "diversity_loss_mlp": 0.0, "epoch": 0.3801462100808003, "flos": 560562094080.0, "grad_norm": 0.07660828670939425, "language_loss": 0.83672053, "learning_rate": 0.0007115130312389756, "loss": 0.8475588, "num_input_tokens_seen": 164414928, "router_z_loss_mlp": 0.12145996, "routerloss_mlp": 0.0, "step": 1976, "time_per_iteration": 2.7103323936462402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084468, "balance_loss_mlp": 1.07200503, "diversity_loss_mlp": 0.0, "epoch": 0.3803385917660639, "flos": 464936412672.0, "grad_norm": 0.08353002189035653, "language_loss": 0.79290646, "learning_rate": 0.0007112306962552973, "loss": 0.80375111, "num_input_tokens_seen": 164483312, "router_z_loss_mlp": 0.12463379, "routerloss_mlp": 0.0, "step": 1977, "time_per_iteration": 2.576239824295044 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084774, "balance_loss_mlp": 1.07254314, "diversity_loss_mlp": 0.0, "epoch": 0.3805309734513274, "flos": 521871538176.0, "grad_norm": 0.06483406604645132, "language_loss": 0.85315859, "learning_rate": 0.0007109482792645896, "loss": 0.86400628, "num_input_tokens_seen": 164555760, "router_z_loss_mlp": 0.12237549, "routerloss_mlp": 0.0, "step": 1978, "time_per_iteration": 2.7146143913269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084927, "balance_loss_mlp": 1.07276165, "diversity_loss_mlp": 0.0, "epoch": 0.380723355136591, "flos": 591412783104.0, "grad_norm": 0.06865418790878511, "language_loss": 0.83831733, "learning_rate": 0.0007106657803764969, "loss": 0.84916663, "num_input_tokens_seen": 164626768, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 1979, "time_per_iteration": 2.73152494430542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086146, "balance_loss_mlp": 1.07395101, "diversity_loss_mlp": 0.0, "epoch": 0.38091573682185453, "flos": 622685988864.0, "grad_norm": 0.07620298141647525, "language_loss": 0.81962979, "learning_rate": 0.0007103831997006948, "loss": 0.83049119, "num_input_tokens_seen": 164698016, "router_z_loss_mlp": 0.12194824, "routerloss_mlp": 0.0, "step": 1980, "time_per_iteration": 2.7383615970611572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094611, "balance_loss_mlp": 1.08276772, "diversity_loss_mlp": 0.0, "epoch": 0.3811081185071181, "flos": 569007286272.0, "grad_norm": 0.0842263164190672, "language_loss": 0.85342598, "learning_rate": 0.0007101005373468908, "loss": 0.86437213, "num_input_tokens_seen": 164780320, "router_z_loss_mlp": 0.1184082, "routerloss_mlp": 0.0, "step": 1981, "time_per_iteration": 2.889251708984375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097444, "balance_loss_mlp": 1.08543372, "diversity_loss_mlp": 0.0, "epoch": 0.3813005001923817, "flos": 584837895168.0, "grad_norm": 0.06048237516575629, "language_loss": 0.86649287, "learning_rate": 0.0007098177934248242, "loss": 0.87746727, "num_input_tokens_seen": 164854400, "router_z_loss_mlp": 0.12011719, "routerloss_mlp": 0.0, "step": 1982, "time_per_iteration": 2.773146867752075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00920145, "balance_loss_mlp": 1.60273147, "diversity_loss_mlp": 0.20649332, "epoch": 0.38149288187764524, "flos": 621591335424.0, "grad_norm": 0.033525346661278974, "language_loss": 0.85516387, "learning_rate": 0.0007095349680442661, "loss": 0.86436534, "num_input_tokens_seen": 164932896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01553278, "step": 1983, "time_per_iteration": 2.8675785064697266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116298, "balance_loss_mlp": 1.1045742, "diversity_loss_mlp": 0.0, "epoch": 0.3816852635629088, "flos": 570690012672.0, "grad_norm": 0.06407324010727367, "language_loss": 0.78783178, "learning_rate": 0.0007092520613150188, "loss": 0.79899484, "num_input_tokens_seen": 165002896, "router_z_loss_mlp": 0.1171875, "routerloss_mlp": 0.0, "step": 1984, "time_per_iteration": 2.709177017211914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00918651, "balance_loss_mlp": 1.59999418, "diversity_loss_mlp": 0.20665541, "epoch": 0.38187764524817236, "flos": 565585307136.0, "grad_norm": 0.03070680845617011, "language_loss": 0.80925471, "learning_rate": 0.0007089690733469165, "loss": 0.81844121, "num_input_tokens_seen": 165074704, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01532666, "step": 1985, "time_per_iteration": 2.750558376312256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135734, "balance_loss_mlp": 1.12384343, "diversity_loss_mlp": 0.0, "epoch": 0.38207002693343595, "flos": 631225156608.0, "grad_norm": 0.08571071539105668, "language_loss": 0.82313848, "learning_rate": 0.000708686004249825, "loss": 0.83449578, "num_input_tokens_seen": 165149136, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 1986, "time_per_iteration": 2.7550368309020996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01132102, "balance_loss_mlp": 1.12012124, "diversity_loss_mlp": 0.0, "epoch": 0.3822624086186995, "flos": 548773843968.0, "grad_norm": 0.07744479108461458, "language_loss": 0.91340905, "learning_rate": 0.0007084028541336413, "loss": 0.92473006, "num_input_tokens_seen": 165220864, "router_z_loss_mlp": 0.11975098, "routerloss_mlp": 0.0, "step": 1987, "time_per_iteration": 2.703339099884033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00914957, "balance_loss_mlp": 1.59260678, "diversity_loss_mlp": 0.20690078, "epoch": 0.38245479030396307, "flos": 613870036992.0, "grad_norm": 0.03035395776464378, "language_loss": 0.86267084, "learning_rate": 0.0007081196231082942, "loss": 0.87182039, "num_input_tokens_seen": 165301568, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01520337, "step": 1988, "time_per_iteration": 2.8075153827667236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117714, "balance_loss_mlp": 1.10567343, "diversity_loss_mlp": 0.0, "epoch": 0.38264717198922665, "flos": 668089824768.0, "grad_norm": 0.07746710731409655, "language_loss": 0.80053389, "learning_rate": 0.0007078363112837436, "loss": 0.81171107, "num_input_tokens_seen": 165373152, "router_z_loss_mlp": 0.12036133, "routerloss_mlp": 0.0, "step": 1989, "time_per_iteration": 2.811197280883789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104881, "balance_loss_mlp": 1.09261441, "diversity_loss_mlp": 0.0, "epoch": 0.3828395536744902, "flos": 454754165760.0, "grad_norm": 0.07961201652041947, "language_loss": 0.84721339, "learning_rate": 0.000707552918769981, "loss": 0.85826218, "num_input_tokens_seen": 165439136, "router_z_loss_mlp": 0.12261963, "routerloss_mlp": 0.0, "step": 1990, "time_per_iteration": 2.4908246994018555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102216, "balance_loss_mlp": 1.08987188, "diversity_loss_mlp": 0.0, "epoch": 0.3830319353597538, "flos": 499448788992.0, "grad_norm": 0.06284554422997896, "language_loss": 0.83619118, "learning_rate": 0.000707269445677029, "loss": 0.84721333, "num_input_tokens_seen": 165514624, "router_z_loss_mlp": 0.12341309, "routerloss_mlp": 0.0, "step": 1991, "time_per_iteration": 2.733126401901245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101588, "balance_loss_mlp": 1.08921361, "diversity_loss_mlp": 0.0, "epoch": 0.3832243170450173, "flos": 744121893888.0, "grad_norm": 0.07203164936975576, "language_loss": 0.85140717, "learning_rate": 0.0007069858921149416, "loss": 0.86242306, "num_input_tokens_seen": 165594512, "router_z_loss_mlp": 0.12371826, "routerloss_mlp": 0.0, "step": 1992, "time_per_iteration": 2.9382007122039795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096798, "balance_loss_mlp": 1.08434701, "diversity_loss_mlp": 0.0, "epoch": 0.3834166987302809, "flos": 578218590720.0, "grad_norm": 0.05485930037569587, "language_loss": 0.85794246, "learning_rate": 0.0007067022581938043, "loss": 0.86891043, "num_input_tokens_seen": 165673968, "router_z_loss_mlp": 0.12457275, "routerloss_mlp": 0.0, "step": 1993, "time_per_iteration": 2.857525110244751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095406, "balance_loss_mlp": 1.08321714, "diversity_loss_mlp": 0.0, "epoch": 0.3836090804155444, "flos": 536476442112.0, "grad_norm": 0.0871408980162776, "language_loss": 0.83722532, "learning_rate": 0.0007064185440237334, "loss": 0.8481794, "num_input_tokens_seen": 165747664, "router_z_loss_mlp": 0.12188721, "routerloss_mlp": 0.0, "step": 1994, "time_per_iteration": 2.7131123542785645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099231, "balance_loss_mlp": 1.08733368, "diversity_loss_mlp": 0.0, "epoch": 0.383801462100808, "flos": 601879154688.0, "grad_norm": 0.06357294591464056, "language_loss": 0.84358412, "learning_rate": 0.0007061347497148764, "loss": 0.85457647, "num_input_tokens_seen": 165824624, "router_z_loss_mlp": 0.11895752, "routerloss_mlp": 0.0, "step": 1995, "time_per_iteration": 2.7398569583892822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102339, "balance_loss_mlp": 1.09015, "diversity_loss_mlp": 0.0, "epoch": 0.38399384378607154, "flos": 572701280256.0, "grad_norm": 0.07322887134464046, "language_loss": 0.86299884, "learning_rate": 0.0007058508753774122, "loss": 0.87402225, "num_input_tokens_seen": 165896304, "router_z_loss_mlp": 0.12188721, "routerloss_mlp": 0.0, "step": 1996, "time_per_iteration": 2.6903162002563477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108867, "balance_loss_mlp": 1.09709477, "diversity_loss_mlp": 0.0, "epoch": 0.38418622547133513, "flos": 536765709312.0, "grad_norm": 0.0698381422429368, "language_loss": 0.86921895, "learning_rate": 0.0007055669211215505, "loss": 0.88030767, "num_input_tokens_seen": 165961312, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 1997, "time_per_iteration": 2.695028066635132 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113141, "balance_loss_mlp": 1.10084486, "diversity_loss_mlp": 0.0, "epoch": 0.3843786071565987, "flos": 572940988416.0, "grad_norm": 0.08585182349688475, "language_loss": 0.77776283, "learning_rate": 0.0007052828870575322, "loss": 0.78889418, "num_input_tokens_seen": 166028064, "router_z_loss_mlp": 0.12298584, "routerloss_mlp": 0.0, "step": 1998, "time_per_iteration": 2.685685873031616 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.011259, "balance_loss_mlp": 1.11406291, "diversity_loss_mlp": 0.0, "epoch": 0.38457098884186225, "flos": 728703889920.0, "grad_norm": 0.06979871165732322, "language_loss": 0.87060714, "learning_rate": 0.0007049987732956291, "loss": 0.8818661, "num_input_tokens_seen": 166110272, "router_z_loss_mlp": 0.11834717, "routerloss_mlp": 0.0, "step": 1999, "time_per_iteration": 2.9710631370544434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110647, "balance_loss_mlp": 1.09428668, "diversity_loss_mlp": 0.0, "epoch": 0.38476337052712584, "flos": 583422041088.0, "grad_norm": 0.05561177596637214, "language_loss": 0.82812738, "learning_rate": 0.0007047145799461439, "loss": 0.83919203, "num_input_tokens_seen": 166193088, "router_z_loss_mlp": 0.12176514, "routerloss_mlp": 0.0, "step": 2000, "time_per_iteration": 2.8492860794067383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105216, "balance_loss_mlp": 1.09293747, "diversity_loss_mlp": 0.0, "epoch": 0.38495575221238937, "flos": 553060680192.0, "grad_norm": 0.06017266002852966, "language_loss": 0.82272708, "learning_rate": 0.00070443030711941, "loss": 0.83377922, "num_input_tokens_seen": 166271776, "router_z_loss_mlp": 0.1227417, "routerloss_mlp": 0.0, "step": 2001, "time_per_iteration": 2.769383430480957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100076, "balance_loss_mlp": 1.08806002, "diversity_loss_mlp": 0.0, "epoch": 0.38514813389765296, "flos": 654473115648.0, "grad_norm": 0.061888534691205976, "language_loss": 0.82098496, "learning_rate": 0.0007041459549257924, "loss": 0.83198571, "num_input_tokens_seen": 166350000, "router_z_loss_mlp": 0.12011719, "routerloss_mlp": 0.0, "step": 2002, "time_per_iteration": 2.876244306564331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089803, "balance_loss_mlp": 1.07744145, "diversity_loss_mlp": 0.0, "epoch": 0.3853405155829165, "flos": 868100239872.0, "grad_norm": 0.06816771124006925, "language_loss": 0.78024125, "learning_rate": 0.0007038615234756859, "loss": 0.79113925, "num_input_tokens_seen": 166434336, "router_z_loss_mlp": 0.12359619, "routerloss_mlp": 0.0, "step": 2003, "time_per_iteration": 3.1744768619537354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086899, "balance_loss_mlp": 1.07477546, "diversity_loss_mlp": 0.0, "epoch": 0.3855328972681801, "flos": 546424123392.0, "grad_norm": 0.09233530116269285, "language_loss": 0.83808231, "learning_rate": 0.000703577012879517, "loss": 0.84895122, "num_input_tokens_seen": 166503952, "router_z_loss_mlp": 0.12115479, "routerloss_mlp": 0.0, "step": 2004, "time_per_iteration": 2.633391857147217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089004, "balance_loss_mlp": 1.07705307, "diversity_loss_mlp": 0.0, "epoch": 0.3857252789534436, "flos": 534074964480.0, "grad_norm": 0.07105955558417659, "language_loss": 0.88946962, "learning_rate": 0.0007032924232477423, "loss": 0.90035963, "num_input_tokens_seen": 166575168, "router_z_loss_mlp": 0.11950684, "routerloss_mlp": 0.0, "step": 2005, "time_per_iteration": 2.6482574939727783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109168, "balance_loss_mlp": 1.0797528, "diversity_loss_mlp": 0.0, "epoch": 0.3859176606387072, "flos": 491764566528.0, "grad_norm": 0.07024694433071269, "language_loss": 0.80605727, "learning_rate": 0.0007030077546908493, "loss": 0.81697416, "num_input_tokens_seen": 166647552, "router_z_loss_mlp": 0.1192627, "routerloss_mlp": 0.0, "step": 2006, "time_per_iteration": 2.6219046115875244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087051, "balance_loss_mlp": 1.08056581, "diversity_loss_mlp": 0.0, "epoch": 0.3861100423239708, "flos": 1487052214272.0, "grad_norm": 0.032453276732354666, "language_loss": 0.83064663, "learning_rate": 0.0007027230073193561, "loss": 0.84151709, "num_input_tokens_seen": 166875088, "router_z_loss_mlp": 0.06494141, "routerloss_mlp": 0.0, "step": 2007, "time_per_iteration": 4.798014402389526 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099136, "balance_loss_mlp": 1.08744717, "diversity_loss_mlp": 0.0, "epoch": 0.3863024240092343, "flos": 473732540928.0, "grad_norm": 0.08661380313869275, "language_loss": 0.79137146, "learning_rate": 0.0007024381812438117, "loss": 0.8023628, "num_input_tokens_seen": 166939344, "router_z_loss_mlp": 0.11676025, "routerloss_mlp": 0.0, "step": 2008, "time_per_iteration": 2.5403189659118652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110095, "balance_loss_mlp": 1.08864713, "diversity_loss_mlp": 0.0, "epoch": 0.3864948056944979, "flos": 716601779712.0, "grad_norm": 0.09407170185597404, "language_loss": 0.83448064, "learning_rate": 0.0007021532765747951, "loss": 0.8454901, "num_input_tokens_seen": 167014992, "router_z_loss_mlp": 0.12310791, "routerloss_mlp": 0.0, "step": 2009, "time_per_iteration": 2.9585187435150146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094144, "balance_loss_mlp": 1.08211613, "diversity_loss_mlp": 0.0, "epoch": 0.38668718737976143, "flos": 727631631360.0, "grad_norm": 0.0684890586406507, "language_loss": 0.79048979, "learning_rate": 0.0007018682934229162, "loss": 0.80143124, "num_input_tokens_seen": 167092096, "router_z_loss_mlp": 0.12017822, "routerloss_mlp": 0.0, "step": 2010, "time_per_iteration": 2.9703307151794434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096489, "balance_loss_mlp": 1.0842756, "diversity_loss_mlp": 0.0, "epoch": 0.386879569065025, "flos": 525471556608.0, "grad_norm": 0.06303649013837292, "language_loss": 0.82761061, "learning_rate": 0.0007015832318988152, "loss": 0.83857542, "num_input_tokens_seen": 167162144, "router_z_loss_mlp": 0.12200928, "routerloss_mlp": 0.0, "step": 2011, "time_per_iteration": 2.6060009002685547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0102794, "balance_loss_mlp": 1.02231336, "diversity_loss_mlp": 0.0, "epoch": 0.38707195075028855, "flos": 1527771663360.0, "grad_norm": 0.017766506591404385, "language_loss": 0.73890078, "learning_rate": 0.000701298092113163, "loss": 0.7491802, "num_input_tokens_seen": 167391536, "router_z_loss_mlp": 0.05615234, "routerloss_mlp": 0.0, "step": 2012, "time_per_iteration": 4.938155651092529 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109088, "balance_loss_mlp": 1.07810068, "diversity_loss_mlp": 0.0, "epoch": 0.38726433243555214, "flos": 557313011712.0, "grad_norm": 0.060967443696148906, "language_loss": 0.84265292, "learning_rate": 0.0007010128741766604, "loss": 0.85356176, "num_input_tokens_seen": 167466000, "router_z_loss_mlp": 0.12792969, "routerloss_mlp": 0.0, "step": 2013, "time_per_iteration": 2.7293431758880615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091499, "balance_loss_mlp": 1.07861209, "diversity_loss_mlp": 0.0, "epoch": 0.38745671412081567, "flos": 553695740928.0, "grad_norm": 0.07873148114105366, "language_loss": 0.84277219, "learning_rate": 0.0007007275782000391, "loss": 0.85368717, "num_input_tokens_seen": 167536144, "router_z_loss_mlp": 0.12896729, "routerloss_mlp": 0.0, "step": 2014, "time_per_iteration": 2.644911766052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091625, "balance_loss_mlp": 1.07889354, "diversity_loss_mlp": 0.0, "epoch": 0.38764909580607926, "flos": 458408512512.0, "grad_norm": 0.0868083489465314, "language_loss": 0.8502394, "learning_rate": 0.0007004422042940605, "loss": 0.86115563, "num_input_tokens_seen": 167600064, "router_z_loss_mlp": 0.12744141, "routerloss_mlp": 0.0, "step": 2015, "time_per_iteration": 2.5096747875213623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109894, "balance_loss_mlp": 1.08593392, "diversity_loss_mlp": 0.0, "epoch": 0.38784147749134285, "flos": 522229814784.0, "grad_norm": 0.08227522563153689, "language_loss": 0.89877218, "learning_rate": 0.0007001567525695169, "loss": 0.90976155, "num_input_tokens_seen": 167666576, "router_z_loss_mlp": 0.13012695, "routerloss_mlp": 0.0, "step": 2016, "time_per_iteration": 2.606520891189575 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105972, "balance_loss_mlp": 1.09330583, "diversity_loss_mlp": 0.0, "epoch": 0.3880338591766064, "flos": 666036338688.0, "grad_norm": 0.06437704205290017, "language_loss": 0.83705699, "learning_rate": 0.0006998712231372303, "loss": 0.84811676, "num_input_tokens_seen": 167753296, "router_z_loss_mlp": 0.12670898, "routerloss_mlp": 0.0, "step": 2017, "time_per_iteration": 3.016061305999756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119415, "balance_loss_mlp": 1.10692167, "diversity_loss_mlp": 0.0, "epoch": 0.38822624086186996, "flos": 593962564608.0, "grad_norm": 0.06622760195410109, "language_loss": 0.85886908, "learning_rate": 0.0006995856161080532, "loss": 0.87006325, "num_input_tokens_seen": 167834080, "router_z_loss_mlp": 0.12487793, "routerloss_mlp": 0.0, "step": 2018, "time_per_iteration": 2.8263893127441406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124253, "balance_loss_mlp": 1.11165869, "diversity_loss_mlp": 0.0, "epoch": 0.3884186225471335, "flos": 612540817920.0, "grad_norm": 0.06957079313074316, "language_loss": 0.82328916, "learning_rate": 0.0006992999315928679, "loss": 0.83453172, "num_input_tokens_seen": 167912368, "router_z_loss_mlp": 0.1260376, "routerloss_mlp": 0.0, "step": 2019, "time_per_iteration": 2.789020299911499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130626, "balance_loss_mlp": 1.11772799, "diversity_loss_mlp": 0.0, "epoch": 0.3886110042323971, "flos": 607038188544.0, "grad_norm": 0.05589846380959986, "language_loss": 0.85480869, "learning_rate": 0.0006990141697025871, "loss": 0.86611497, "num_input_tokens_seen": 167991968, "router_z_loss_mlp": 0.12915039, "routerloss_mlp": 0.0, "step": 2020, "time_per_iteration": 2.788597345352173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067569, "balance_loss_mlp": 1.06141829, "diversity_loss_mlp": 0.0, "epoch": 0.3888033859176606, "flos": 1528067897856.0, "grad_norm": 0.034323999481440985, "language_loss": 0.76359642, "learning_rate": 0.0006987283305481533, "loss": 0.77427208, "num_input_tokens_seen": 168212128, "router_z_loss_mlp": 0.06152344, "routerloss_mlp": 0.0, "step": 2021, "time_per_iteration": 4.782108545303345 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130924, "balance_loss_mlp": 1.11879468, "diversity_loss_mlp": 0.0, "epoch": 0.3889957676029242, "flos": 692449689600.0, "grad_norm": 0.0813348018947899, "language_loss": 0.82333553, "learning_rate": 0.0006984424142405392, "loss": 0.83464473, "num_input_tokens_seen": 168287440, "router_z_loss_mlp": 0.12127686, "routerloss_mlp": 0.0, "step": 2022, "time_per_iteration": 2.804866075515747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118053, "balance_loss_mlp": 1.10578668, "diversity_loss_mlp": 0.0, "epoch": 0.3891881492881878, "flos": 515187993600.0, "grad_norm": 0.07379903296161248, "language_loss": 0.82117045, "learning_rate": 0.0006981564208907474, "loss": 0.83235097, "num_input_tokens_seen": 168354704, "router_z_loss_mlp": 0.12261963, "routerloss_mlp": 0.0, "step": 2023, "time_per_iteration": 2.5883662700653076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130828, "balance_loss_mlp": 1.11855519, "diversity_loss_mlp": 0.0, "epoch": 0.3893805309734513, "flos": 629050904064.0, "grad_norm": 0.07869766022149485, "language_loss": 0.8995713, "learning_rate": 0.0006978703506098102, "loss": 0.91087961, "num_input_tokens_seen": 168424272, "router_z_loss_mlp": 0.12280273, "routerloss_mlp": 0.0, "step": 2024, "time_per_iteration": 2.730283498764038 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127449, "balance_loss_mlp": 1.11556411, "diversity_loss_mlp": 0.0, "epoch": 0.3895729126587149, "flos": 544155895296.0, "grad_norm": 0.0665173530375796, "language_loss": 0.88210815, "learning_rate": 0.00069758420350879, "loss": 0.89338267, "num_input_tokens_seen": 168488912, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2025, "time_per_iteration": 2.62969708442688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00932402, "balance_loss_mlp": 1.62686133, "diversity_loss_mlp": 0.20693868, "epoch": 0.38976529434397844, "flos": 618270672384.0, "grad_norm": 0.03379762859523427, "language_loss": 0.8613863, "learning_rate": 0.000697297979698779, "loss": 0.87071025, "num_input_tokens_seen": 168563248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01550185, "step": 2026, "time_per_iteration": 2.837543249130249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107077, "balance_loss_mlp": 1.09529877, "diversity_loss_mlp": 0.0, "epoch": 0.38995767602924203, "flos": 834882577920.0, "grad_norm": 0.06049708379655892, "language_loss": 0.83660531, "learning_rate": 0.0006970116792908992, "loss": 0.84767604, "num_input_tokens_seen": 168648272, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2027, "time_per_iteration": 3.1133604049682617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107629, "balance_loss_mlp": 1.0960542, "diversity_loss_mlp": 0.0, "epoch": 0.39015005771450556, "flos": 541603542528.0, "grad_norm": 0.07190738956644391, "language_loss": 0.81380564, "learning_rate": 0.000696725302396302, "loss": 0.82488191, "num_input_tokens_seen": 168721760, "router_z_loss_mlp": 0.11566162, "routerloss_mlp": 0.0, "step": 2028, "time_per_iteration": 2.6460230350494385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109766, "balance_loss_mlp": 1.08604932, "diversity_loss_mlp": 0.0, "epoch": 0.39034243939976915, "flos": 1007509072896.0, "grad_norm": 0.06814290150602269, "language_loss": 0.85887402, "learning_rate": 0.0006964388491261692, "loss": 0.86985064, "num_input_tokens_seen": 168803664, "router_z_loss_mlp": 0.1159668, "routerloss_mlp": 0.0, "step": 2029, "time_per_iteration": 3.296208143234253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099195, "balance_loss_mlp": 1.0871129, "diversity_loss_mlp": 0.0, "epoch": 0.3905348210850327, "flos": 679025700864.0, "grad_norm": 0.075812953715104, "language_loss": 0.87511015, "learning_rate": 0.0006961523195917114, "loss": 0.88610214, "num_input_tokens_seen": 168879184, "router_z_loss_mlp": 0.12084961, "routerloss_mlp": 0.0, "step": 2030, "time_per_iteration": 2.803239345550537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107522, "balance_loss_mlp": 1.09573865, "diversity_loss_mlp": 0.0, "epoch": 0.39072720277029627, "flos": 548882500608.0, "grad_norm": 0.0665807006884719, "language_loss": 0.78137511, "learning_rate": 0.0006958657139041696, "loss": 0.79245031, "num_input_tokens_seen": 168957808, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2031, "time_per_iteration": 2.739151954650879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061018, "balance_loss_mlp": 1.05531955, "diversity_loss_mlp": 0.0, "epoch": 0.39091958445555985, "flos": 1547737860096.0, "grad_norm": 0.035996309550900246, "language_loss": 0.76712966, "learning_rate": 0.0006955790321748136, "loss": 0.77773988, "num_input_tokens_seen": 169194416, "router_z_loss_mlp": 0.05688477, "routerloss_mlp": 0.0, "step": 2032, "time_per_iteration": 4.918209552764893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094882, "balance_loss_mlp": 1.08307993, "diversity_loss_mlp": 0.0, "epoch": 0.3911119661408234, "flos": 504002497536.0, "grad_norm": 0.0751880944680772, "language_loss": 0.78643966, "learning_rate": 0.0006952922745149434, "loss": 0.79738843, "num_input_tokens_seen": 169263552, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2033, "time_per_iteration": 2.6274161338806152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091358, "balance_loss_mlp": 1.07940745, "diversity_loss_mlp": 0.0, "epoch": 0.391304347826087, "flos": 557238859776.0, "grad_norm": 0.07391479182011068, "language_loss": 0.87674987, "learning_rate": 0.000695005441035888, "loss": 0.88766348, "num_input_tokens_seen": 169333696, "router_z_loss_mlp": 0.1194458, "routerloss_mlp": 0.0, "step": 2034, "time_per_iteration": 2.647348642349243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01018577, "balance_loss_mlp": 1.01280713, "diversity_loss_mlp": 0.0, "epoch": 0.3914967295113505, "flos": 1500034235904.0, "grad_norm": 0.010435626825017296, "language_loss": 0.73723435, "learning_rate": 0.0006947185318490064, "loss": 0.74742007, "num_input_tokens_seen": 169556416, "router_z_loss_mlp": 0.05761719, "routerloss_mlp": 0.0, "step": 2035, "time_per_iteration": 4.8861188888549805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107172, "balance_loss_mlp": 1.094733, "diversity_loss_mlp": 0.0, "epoch": 0.3916891111966141, "flos": 707037341184.0, "grad_norm": 0.06114898183694146, "language_loss": 0.81133932, "learning_rate": 0.0006944315470656863, "loss": 0.82241106, "num_input_tokens_seen": 169643312, "router_z_loss_mlp": 0.12438965, "routerloss_mlp": 0.0, "step": 2036, "time_per_iteration": 3.0057246685028076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108633, "balance_loss_mlp": 1.09606266, "diversity_loss_mlp": 0.0, "epoch": 0.3918814928818776, "flos": 556349409792.0, "grad_norm": 0.0812142536963638, "language_loss": 0.90953541, "learning_rate": 0.000694144486797345, "loss": 0.92062169, "num_input_tokens_seen": 169712560, "router_z_loss_mlp": 0.12579346, "routerloss_mlp": 0.0, "step": 2037, "time_per_iteration": 2.6566872596740723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01012054, "balance_loss_mlp": 1.0060699, "diversity_loss_mlp": 0.0, "epoch": 0.3920738745671412, "flos": 1538610992640.0, "grad_norm": 0.012879447335335118, "language_loss": 0.79520434, "learning_rate": 0.0006938573511554296, "loss": 0.80532491, "num_input_tokens_seen": 169914912, "router_z_loss_mlp": 0.05981445, "routerloss_mlp": 0.0, "step": 2038, "time_per_iteration": 4.609802722930908 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103729, "balance_loss_mlp": 1.09141517, "diversity_loss_mlp": 0.0, "epoch": 0.39226625625240474, "flos": 498836123136.0, "grad_norm": 0.07718413790316761, "language_loss": 0.89271998, "learning_rate": 0.0006935701402514156, "loss": 0.90375727, "num_input_tokens_seen": 169978848, "router_z_loss_mlp": 0.12316895, "routerloss_mlp": 0.0, "step": 2039, "time_per_iteration": 2.610905408859253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101805, "balance_loss_mlp": 1.01206541, "diversity_loss_mlp": 0.0, "epoch": 0.39245863793766833, "flos": 1347260138496.0, "grad_norm": 0.016017309503016164, "language_loss": 0.73034894, "learning_rate": 0.0006932828541968083, "loss": 0.74052942, "num_input_tokens_seen": 170211488, "router_z_loss_mlp": 0.05981445, "routerloss_mlp": 0.0, "step": 2040, "time_per_iteration": 4.954579830169678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106674, "balance_loss_mlp": 1.09434199, "diversity_loss_mlp": 0.0, "epoch": 0.3926510196229319, "flos": 1346113022976.0, "grad_norm": 0.0728619475730698, "language_loss": 0.84539711, "learning_rate": 0.0006929954931031422, "loss": 0.85646391, "num_input_tokens_seen": 170298528, "router_z_loss_mlp": 0.12329102, "routerloss_mlp": 0.0, "step": 2041, "time_per_iteration": 3.6979990005493164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114297, "balance_loss_mlp": 1.10201287, "diversity_loss_mlp": 0.0, "epoch": 0.39284340130819545, "flos": 499587181056.0, "grad_norm": 0.07303574322286652, "language_loss": 0.88330269, "learning_rate": 0.0006927080570819805, "loss": 0.89444566, "num_input_tokens_seen": 170365680, "router_z_loss_mlp": 0.12280273, "routerloss_mlp": 0.0, "step": 2042, "time_per_iteration": 2.5840306282043457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126631, "balance_loss_mlp": 1.11437607, "diversity_loss_mlp": 0.0, "epoch": 0.39303578299345904, "flos": 520329775104.0, "grad_norm": 0.09784101638347129, "language_loss": 0.80726093, "learning_rate": 0.0006924205462449161, "loss": 0.81852722, "num_input_tokens_seen": 170432224, "router_z_loss_mlp": 0.12255859, "routerloss_mlp": 0.0, "step": 2043, "time_per_iteration": 2.556964159011841 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123407, "balance_loss_mlp": 1.11139631, "diversity_loss_mlp": 0.0, "epoch": 0.39322816467872257, "flos": 907929865728.0, "grad_norm": 0.07674510212981295, "language_loss": 0.81822228, "learning_rate": 0.0006921329607035702, "loss": 0.82945639, "num_input_tokens_seen": 170517920, "router_z_loss_mlp": 0.12005615, "routerloss_mlp": 0.0, "step": 2044, "time_per_iteration": 3.2355051040649414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109464, "balance_loss_mlp": 1.09777582, "diversity_loss_mlp": 0.0, "epoch": 0.39342054636398616, "flos": 517592042496.0, "grad_norm": 0.0626655505852987, "language_loss": 0.87889385, "learning_rate": 0.0006918453005695938, "loss": 0.88998848, "num_input_tokens_seen": 170589072, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2045, "time_per_iteration": 2.616405725479126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112555, "balance_loss_mlp": 1.10047281, "diversity_loss_mlp": 0.0, "epoch": 0.3936129280492497, "flos": 547918898688.0, "grad_norm": 0.0593607382511463, "language_loss": 0.8430419, "learning_rate": 0.0006915575659546662, "loss": 0.85416746, "num_input_tokens_seen": 170657856, "router_z_loss_mlp": 0.12078857, "routerloss_mlp": 0.0, "step": 2046, "time_per_iteration": 2.6596429347991943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100096, "balance_loss_mlp": 1.08785915, "diversity_loss_mlp": 0.0, "epoch": 0.3938053097345133, "flos": 526113957888.0, "grad_norm": 0.0680979304239865, "language_loss": 0.80745959, "learning_rate": 0.0006912697569704959, "loss": 0.81846058, "num_input_tokens_seen": 170723696, "router_z_loss_mlp": 0.12231445, "routerloss_mlp": 0.0, "step": 2047, "time_per_iteration": 2.5962154865264893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097855, "balance_loss_mlp": 1.08564174, "diversity_loss_mlp": 0.0, "epoch": 0.39399769141977686, "flos": 471629869056.0, "grad_norm": 0.07634449995136075, "language_loss": 0.8702817, "learning_rate": 0.0006909818737288205, "loss": 0.88126016, "num_input_tokens_seen": 170789536, "router_z_loss_mlp": 0.12207031, "routerloss_mlp": 0.0, "step": 2048, "time_per_iteration": 2.5559332370758057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111282, "balance_loss_mlp": 1.09955215, "diversity_loss_mlp": 0.0, "epoch": 0.3941900731050404, "flos": 501736840704.0, "grad_norm": 0.07451514550279957, "language_loss": 0.80715293, "learning_rate": 0.000690693916341406, "loss": 0.81826574, "num_input_tokens_seen": 170859232, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2049, "time_per_iteration": 2.605881690979004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115507, "balance_loss_mlp": 1.10377121, "diversity_loss_mlp": 0.0, "epoch": 0.394382454790304, "flos": 580862347776.0, "grad_norm": 0.06516266173427393, "language_loss": 0.82286257, "learning_rate": 0.0006904058849200475, "loss": 0.83401763, "num_input_tokens_seen": 170931568, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2050, "time_per_iteration": 2.7183115482330322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105643, "balance_loss_mlp": 1.09360313, "diversity_loss_mlp": 0.0, "epoch": 0.3945748364755675, "flos": 513819127296.0, "grad_norm": 0.0753850450331705, "language_loss": 0.84972727, "learning_rate": 0.0006901177795765683, "loss": 0.8607837, "num_input_tokens_seen": 170999856, "router_z_loss_mlp": 0.12042236, "routerloss_mlp": 0.0, "step": 2051, "time_per_iteration": 2.627774715423584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105318, "balance_loss_mlp": 1.09354019, "diversity_loss_mlp": 0.0, "epoch": 0.3947672181608311, "flos": 593957795328.0, "grad_norm": 0.06465732667856934, "language_loss": 0.81096435, "learning_rate": 0.0006898296004228213, "loss": 0.82201755, "num_input_tokens_seen": 171072320, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2052, "time_per_iteration": 2.7607421875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050397, "balance_loss_mlp": 1.04446077, "diversity_loss_mlp": 0.0, "epoch": 0.39495959984609463, "flos": 1547671048704.0, "grad_norm": 0.03031396698302257, "language_loss": 0.7812674, "learning_rate": 0.0006895413475706873, "loss": 0.79177135, "num_input_tokens_seen": 171304128, "router_z_loss_mlp": 0.05932617, "routerloss_mlp": 0.0, "step": 2053, "time_per_iteration": 4.876460552215576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117073, "balance_loss_mlp": 1.10529494, "diversity_loss_mlp": 0.0, "epoch": 0.3951519815313582, "flos": 496520907264.0, "grad_norm": 0.1105412420488248, "language_loss": 0.79620701, "learning_rate": 0.0006892530211320763, "loss": 0.80737776, "num_input_tokens_seen": 171377392, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2054, "time_per_iteration": 2.702591896057129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00944261, "balance_loss_mlp": 1.6481061, "diversity_loss_mlp": 0.21043469, "epoch": 0.39534436321662175, "flos": 531191499264.0, "grad_norm": 0.03587460904718008, "language_loss": 0.84313488, "learning_rate": 0.000688964621218926, "loss": 0.85257751, "num_input_tokens_seen": 171447424, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01499031, "step": 2055, "time_per_iteration": 2.6392524242401123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109262, "balance_loss_mlp": 1.08063984, "diversity_loss_mlp": 0.0, "epoch": 0.39553674490188534, "flos": 702523279872.0, "grad_norm": 0.0862390851468888, "language_loss": 0.80478442, "learning_rate": 0.0006886761479432037, "loss": 0.81571066, "num_input_tokens_seen": 171519920, "router_z_loss_mlp": 0.11968994, "routerloss_mlp": 0.0, "step": 2056, "time_per_iteration": 2.8577234745025635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079958, "balance_loss_mlp": 1.06739902, "diversity_loss_mlp": 0.0, "epoch": 0.3957291265871489, "flos": 409772846592.0, "grad_norm": 0.06874544900142358, "language_loss": 0.84387571, "learning_rate": 0.0006883876014169045, "loss": 0.85467529, "num_input_tokens_seen": 171583856, "router_z_loss_mlp": 0.12554932, "routerloss_mlp": 0.0, "step": 2057, "time_per_iteration": 2.572458505630493 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073975, "balance_loss_mlp": 1.06154716, "diversity_loss_mlp": 0.0, "epoch": 0.39592150827241246, "flos": 618490556928.0, "grad_norm": 0.07681071569739906, "language_loss": 0.90056652, "learning_rate": 0.000688098981752052, "loss": 0.91130626, "num_input_tokens_seen": 171656064, "router_z_loss_mlp": 0.12432861, "routerloss_mlp": 0.0, "step": 2058, "time_per_iteration": 2.7125563621520996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080003, "balance_loss_mlp": 1.06697917, "diversity_loss_mlp": 0.0, "epoch": 0.39611388995767605, "flos": 821332680192.0, "grad_norm": 0.08571295812058347, "language_loss": 0.80176479, "learning_rate": 0.0006878102890606982, "loss": 0.81256485, "num_input_tokens_seen": 171738800, "router_z_loss_mlp": 0.13043213, "routerloss_mlp": 0.0, "step": 2059, "time_per_iteration": 3.0797197818756104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108467, "balance_loss_mlp": 1.07161617, "diversity_loss_mlp": 0.0, "epoch": 0.3963062716429396, "flos": 492224159232.0, "grad_norm": 0.08415103615204221, "language_loss": 0.81576395, "learning_rate": 0.0006875215234549239, "loss": 0.82661068, "num_input_tokens_seen": 171803664, "router_z_loss_mlp": 0.1307373, "routerloss_mlp": 0.0, "step": 2060, "time_per_iteration": 2.5358171463012695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078952, "balance_loss_mlp": 1.06604218, "diversity_loss_mlp": 0.0, "epoch": 0.39649865332820317, "flos": 584739150336.0, "grad_norm": 0.08360675720274492, "language_loss": 0.85212821, "learning_rate": 0.0006872326850468376, "loss": 0.86291778, "num_input_tokens_seen": 171871968, "router_z_loss_mlp": 0.12927246, "routerloss_mlp": 0.0, "step": 2061, "time_per_iteration": 2.685746669769287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079022, "balance_loss_mlp": 1.06612396, "diversity_loss_mlp": 0.0, "epoch": 0.3966910350134667, "flos": 458564156928.0, "grad_norm": 0.08669948408116639, "language_loss": 0.78834969, "learning_rate": 0.0006869437739485762, "loss": 0.79913992, "num_input_tokens_seen": 171942368, "router_z_loss_mlp": 0.12908936, "routerloss_mlp": 0.0, "step": 2062, "time_per_iteration": 2.608938455581665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085084, "balance_loss_mlp": 1.07266808, "diversity_loss_mlp": 0.0, "epoch": 0.3968834166987303, "flos": 508632929280.0, "grad_norm": 0.06314890183319057, "language_loss": 0.92750764, "learning_rate": 0.0006866547902723053, "loss": 0.93835843, "num_input_tokens_seen": 172012336, "router_z_loss_mlp": 0.12420654, "routerloss_mlp": 0.0, "step": 2063, "time_per_iteration": 2.654764175415039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083714, "balance_loss_mlp": 1.07135844, "diversity_loss_mlp": 0.0, "epoch": 0.3970757983839938, "flos": 572627128320.0, "grad_norm": 0.10797740353372913, "language_loss": 0.80444092, "learning_rate": 0.000686365734130218, "loss": 0.81527805, "num_input_tokens_seen": 172084640, "router_z_loss_mlp": 0.12365723, "routerloss_mlp": 0.0, "step": 2064, "time_per_iteration": 2.7161076068878174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085534, "balance_loss_mlp": 1.07345843, "diversity_loss_mlp": 0.0, "epoch": 0.3972681800692574, "flos": 481629307392.0, "grad_norm": 0.06605501724079509, "language_loss": 0.83883071, "learning_rate": 0.000686076605634536, "loss": 0.84968603, "num_input_tokens_seen": 172152992, "router_z_loss_mlp": 0.12084961, "routerloss_mlp": 0.0, "step": 2065, "time_per_iteration": 2.5960052013397217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088286, "balance_loss_mlp": 1.07656133, "diversity_loss_mlp": 0.0, "epoch": 0.397460561754521, "flos": 487927411200.0, "grad_norm": 0.06893141882644385, "language_loss": 0.84303313, "learning_rate": 0.0006857874048975088, "loss": 0.85391599, "num_input_tokens_seen": 172219312, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2066, "time_per_iteration": 2.5419557094573975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098331, "balance_loss_mlp": 1.08599246, "diversity_loss_mlp": 0.0, "epoch": 0.3976529434397845, "flos": 421993525248.0, "grad_norm": 0.07076940729430262, "language_loss": 0.86944497, "learning_rate": 0.0006854981320314142, "loss": 0.88042831, "num_input_tokens_seen": 172282112, "router_z_loss_mlp": 0.12329102, "routerloss_mlp": 0.0, "step": 2067, "time_per_iteration": 2.4425127506256104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101956, "balance_loss_mlp": 1.0900414, "diversity_loss_mlp": 0.0, "epoch": 0.3978453251250481, "flos": 545589001728.0, "grad_norm": 0.08678893766230582, "language_loss": 0.86775517, "learning_rate": 0.0006852087871485579, "loss": 0.87877476, "num_input_tokens_seen": 172347872, "router_z_loss_mlp": 0.11914062, "routerloss_mlp": 0.0, "step": 2068, "time_per_iteration": 2.617234468460083 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104627, "balance_loss_mlp": 1.09308147, "diversity_loss_mlp": 0.0, "epoch": 0.39803770681031164, "flos": 650838592512.0, "grad_norm": 0.08540761893483814, "language_loss": 0.81805646, "learning_rate": 0.0006849193703612735, "loss": 0.82910275, "num_input_tokens_seen": 172418560, "router_z_loss_mlp": 0.11547852, "routerloss_mlp": 0.0, "step": 2069, "time_per_iteration": 2.7818312644958496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110023, "balance_loss_mlp": 1.08808875, "diversity_loss_mlp": 0.0, "epoch": 0.39823008849557523, "flos": 740072194560.0, "grad_norm": 0.06305964525737012, "language_loss": 0.77731991, "learning_rate": 0.0006846298817819225, "loss": 0.78832221, "num_input_tokens_seen": 172497984, "router_z_loss_mlp": 0.12139893, "routerloss_mlp": 0.0, "step": 2070, "time_per_iteration": 2.970045328140259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099656, "balance_loss_mlp": 1.08777106, "diversity_loss_mlp": 0.0, "epoch": 0.39842247018083876, "flos": 385037452800.0, "grad_norm": 0.09229213766989015, "language_loss": 0.81058359, "learning_rate": 0.0006843403215228945, "loss": 0.82158017, "num_input_tokens_seen": 172560112, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2071, "time_per_iteration": 2.47542405128479 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097427, "balance_loss_mlp": 1.08525538, "diversity_loss_mlp": 0.0, "epoch": 0.39861485186610235, "flos": 533696864256.0, "grad_norm": 0.06250612449775428, "language_loss": 0.80665851, "learning_rate": 0.0006840506896966065, "loss": 0.81763273, "num_input_tokens_seen": 172636192, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 2072, "time_per_iteration": 2.7048730850219727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102151, "balance_loss_mlp": 1.09000397, "diversity_loss_mlp": 0.0, "epoch": 0.39880723355136594, "flos": 643149227520.0, "grad_norm": 0.07670911788950584, "language_loss": 0.82343054, "learning_rate": 0.0006837609864155038, "loss": 0.83445203, "num_input_tokens_seen": 172715264, "router_z_loss_mlp": 0.12139893, "routerloss_mlp": 0.0, "step": 2073, "time_per_iteration": 2.940208673477173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111546, "balance_loss_mlp": 1.09976768, "diversity_loss_mlp": 0.0, "epoch": 0.39899961523662947, "flos": 515847647232.0, "grad_norm": 0.06443735331096001, "language_loss": 0.83203363, "learning_rate": 0.0006834712117920592, "loss": 0.84314907, "num_input_tokens_seen": 172783456, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2074, "time_per_iteration": 2.6217153072357178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111182, "balance_loss_mlp": 1.09892166, "diversity_loss_mlp": 0.0, "epoch": 0.39919199692189306, "flos": 464385415680.0, "grad_norm": 0.07401760730887977, "language_loss": 0.85670066, "learning_rate": 0.0006831813659387729, "loss": 0.86781245, "num_input_tokens_seen": 172848928, "router_z_loss_mlp": 0.12261963, "routerloss_mlp": 0.0, "step": 2075, "time_per_iteration": 2.5696237087249756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109855, "balance_loss_mlp": 1.09774292, "diversity_loss_mlp": 0.0, "epoch": 0.3993843786071566, "flos": 531641180160.0, "grad_norm": 0.05990934262108594, "language_loss": 0.84167391, "learning_rate": 0.0006828914489681733, "loss": 0.85277247, "num_input_tokens_seen": 172921152, "router_z_loss_mlp": 0.12109375, "routerloss_mlp": 0.0, "step": 2076, "time_per_iteration": 2.7859339714050293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119025, "balance_loss_mlp": 1.1072948, "diversity_loss_mlp": 0.0, "epoch": 0.3995767602924202, "flos": 503965421568.0, "grad_norm": 0.06517456650976074, "language_loss": 0.85312855, "learning_rate": 0.0006826014609928162, "loss": 0.86431879, "num_input_tokens_seen": 172998864, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2077, "time_per_iteration": 2.6851699352264404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0060157, "balance_loss_mlp": 1.02597332, "diversity_loss_mlp": 0.1552759, "epoch": 0.3997691419776837, "flos": 1454516600832.0, "grad_norm": 0.0013651319096223075, "language_loss": 0.83199388, "learning_rate": 0.0006823114021252846, "loss": 0.8380096, "num_input_tokens_seen": 173219216, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01094547, "step": 2078, "time_per_iteration": 4.859188795089722 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114581, "balance_loss_mlp": 1.1030947, "diversity_loss_mlp": 0.0, "epoch": 0.3999615236629473, "flos": 530684918784.0, "grad_norm": 0.0748648316539235, "language_loss": 0.80062771, "learning_rate": 0.0006820212724781896, "loss": 0.81177354, "num_input_tokens_seen": 173292000, "router_z_loss_mlp": 0.11486816, "routerloss_mlp": 0.0, "step": 2079, "time_per_iteration": 2.6628189086914062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106499, "balance_loss_mlp": 1.09492946, "diversity_loss_mlp": 0.0, "epoch": 0.4001539053482108, "flos": 695130522624.0, "grad_norm": 0.06148312623903997, "language_loss": 0.83733618, "learning_rate": 0.0006817310721641694, "loss": 0.84840119, "num_input_tokens_seen": 173365568, "router_z_loss_mlp": 0.11566162, "routerloss_mlp": 0.0, "step": 2080, "time_per_iteration": 2.847182512283325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119227, "balance_loss_mlp": 1.10731816, "diversity_loss_mlp": 0.0, "epoch": 0.4003462870334744, "flos": 520356939264.0, "grad_norm": 0.07223167054032475, "language_loss": 0.83566946, "learning_rate": 0.00068144080129589, "loss": 0.84686172, "num_input_tokens_seen": 173430144, "router_z_loss_mlp": 0.11907959, "routerloss_mlp": 0.0, "step": 2081, "time_per_iteration": 2.7161402702331543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115616, "balance_loss_mlp": 1.10388541, "diversity_loss_mlp": 0.0, "epoch": 0.400538668718738, "flos": 492518195712.0, "grad_norm": 0.07619573858560975, "language_loss": 0.8280167, "learning_rate": 0.0006811504599860441, "loss": 0.83917284, "num_input_tokens_seen": 173494464, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2082, "time_per_iteration": 2.5584774017333984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104984, "balance_loss_mlp": 1.0928719, "diversity_loss_mlp": 0.0, "epoch": 0.40073105040400153, "flos": 490356052992.0, "grad_norm": 0.1306421138400452, "language_loss": 0.8569895, "learning_rate": 0.0006808600483473526, "loss": 0.86803931, "num_input_tokens_seen": 173577168, "router_z_loss_mlp": 0.12109375, "routerloss_mlp": 0.0, "step": 2083, "time_per_iteration": 2.864786148071289 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094797, "balance_loss_mlp": 1.0824883, "diversity_loss_mlp": 0.0, "epoch": 0.4009234320892651, "flos": 562378070016.0, "grad_norm": 0.06339794743033755, "language_loss": 0.86393988, "learning_rate": 0.0006805695664925629, "loss": 0.87488782, "num_input_tokens_seen": 173655632, "router_z_loss_mlp": 0.12304688, "routerloss_mlp": 0.0, "step": 2084, "time_per_iteration": 2.844709634780884 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089155, "balance_loss_mlp": 1.07735372, "diversity_loss_mlp": 0.0, "epoch": 0.40111581377452865, "flos": 425998808064.0, "grad_norm": 0.0888076684038974, "language_loss": 0.83841193, "learning_rate": 0.0006802790145344506, "loss": 0.84930348, "num_input_tokens_seen": 173719040, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2085, "time_per_iteration": 2.4883856773376465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083386, "balance_loss_mlp": 1.07145894, "diversity_loss_mlp": 0.0, "epoch": 0.40130819545979224, "flos": 612446842368.0, "grad_norm": 0.07803386161895243, "language_loss": 0.87420845, "learning_rate": 0.0006799883925858176, "loss": 0.88504231, "num_input_tokens_seen": 173796704, "router_z_loss_mlp": 0.11914062, "routerloss_mlp": 0.0, "step": 2086, "time_per_iteration": 2.8824286460876465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088787, "balance_loss_mlp": 1.0766871, "diversity_loss_mlp": 0.0, "epoch": 0.40150057714505577, "flos": 523433124864.0, "grad_norm": 0.06924310288687491, "language_loss": 0.85459089, "learning_rate": 0.0006796977007594933, "loss": 0.86547881, "num_input_tokens_seen": 173862352, "router_z_loss_mlp": 0.12091064, "routerloss_mlp": 0.0, "step": 2087, "time_per_iteration": 2.6597371101379395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00970559, "balance_loss_mlp": 1.6983223, "diversity_loss_mlp": 0.21244028, "epoch": 0.40169295883031936, "flos": 561424379904.0, "grad_norm": 0.03280700890509502, "language_loss": 0.86715519, "learning_rate": 0.0006794069391683345, "loss": 0.87686074, "num_input_tokens_seen": 173935408, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01517779, "step": 2088, "time_per_iteration": 2.7649624347686768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078316, "balance_loss_mlp": 1.06610286, "diversity_loss_mlp": 0.0, "epoch": 0.4018853405155829, "flos": 518997984768.0, "grad_norm": 0.07764554073270104, "language_loss": 0.80781567, "learning_rate": 0.0006791161079252248, "loss": 0.81859887, "num_input_tokens_seen": 174007152, "router_z_loss_mlp": 0.12213135, "routerloss_mlp": 0.0, "step": 2089, "time_per_iteration": 2.6467885971069336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082706, "balance_loss_mlp": 1.07014716, "diversity_loss_mlp": 0.0, "epoch": 0.4020777222008465, "flos": 526222614528.0, "grad_norm": 0.0935978018434956, "language_loss": 0.82482743, "learning_rate": 0.0006788252071430747, "loss": 0.8356545, "num_input_tokens_seen": 174074976, "router_z_loss_mlp": 0.12561035, "routerloss_mlp": 0.0, "step": 2090, "time_per_iteration": 2.684659242630005 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076621, "balance_loss_mlp": 1.06417561, "diversity_loss_mlp": 0.0, "epoch": 0.40227010388611006, "flos": 525763021824.0, "grad_norm": 0.061003649340911806, "language_loss": 0.86884034, "learning_rate": 0.0006785342369348222, "loss": 0.87960654, "num_input_tokens_seen": 174149392, "router_z_loss_mlp": 0.12451172, "routerloss_mlp": 0.0, "step": 2091, "time_per_iteration": 2.7500762939453125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081239, "balance_loss_mlp": 1.06896663, "diversity_loss_mlp": 0.0, "epoch": 0.4024624855713736, "flos": 432304252416.0, "grad_norm": 0.08323404973511926, "language_loss": 0.79681003, "learning_rate": 0.0006782431974134316, "loss": 0.80762231, "num_input_tokens_seen": 174214656, "router_z_loss_mlp": 0.1227417, "routerloss_mlp": 0.0, "step": 2092, "time_per_iteration": 2.554500102996826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085193, "balance_loss_mlp": 1.07266974, "diversity_loss_mlp": 0.0, "epoch": 0.4026548672566372, "flos": 766660640256.0, "grad_norm": 0.06323665884579813, "language_loss": 0.89339125, "learning_rate": 0.0006779520886918949, "loss": 0.90424317, "num_input_tokens_seen": 174296064, "router_z_loss_mlp": 0.12524414, "routerloss_mlp": 0.0, "step": 2093, "time_per_iteration": 3.0625791549682617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109256, "balance_loss_mlp": 1.08038247, "diversity_loss_mlp": 0.0, "epoch": 0.4028472489419007, "flos": 642931914240.0, "grad_norm": 0.06591278584355922, "language_loss": 0.81594688, "learning_rate": 0.0006776609108832301, "loss": 0.82687247, "num_input_tokens_seen": 174370896, "router_z_loss_mlp": 0.12176514, "routerloss_mlp": 0.0, "step": 2094, "time_per_iteration": 2.84006929397583 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099273, "balance_loss_mlp": 1.08723903, "diversity_loss_mlp": 0.0, "epoch": 0.4030396306271643, "flos": 491838718464.0, "grad_norm": 0.07397134749055344, "language_loss": 0.84911013, "learning_rate": 0.0006773696641004828, "loss": 0.86010277, "num_input_tokens_seen": 174438448, "router_z_loss_mlp": 0.12030029, "routerloss_mlp": 0.0, "step": 2095, "time_per_iteration": 2.5662059783935547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110289, "balance_loss_mlp": 1.09781969, "diversity_loss_mlp": 0.0, "epoch": 0.40323201231242783, "flos": 901728308736.0, "grad_norm": 0.07471072764212172, "language_loss": 0.77422667, "learning_rate": 0.0006770783484567247, "loss": 0.78532958, "num_input_tokens_seen": 174525952, "router_z_loss_mlp": 0.12475586, "routerloss_mlp": 0.0, "step": 2096, "time_per_iteration": 3.120000123977661 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106409, "balance_loss_mlp": 1.09445786, "diversity_loss_mlp": 0.0, "epoch": 0.4034243939976914, "flos": 570558961152.0, "grad_norm": 0.05645154934481913, "language_loss": 0.85885596, "learning_rate": 0.000676786964065055, "loss": 0.86992002, "num_input_tokens_seen": 174607200, "router_z_loss_mlp": 0.1194458, "routerloss_mlp": 0.0, "step": 2097, "time_per_iteration": 2.7947449684143066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109667, "balance_loss_mlp": 1.09767413, "diversity_loss_mlp": 0.0, "epoch": 0.403616775682955, "flos": 507456783360.0, "grad_norm": 0.06468702094514471, "language_loss": 0.78823644, "learning_rate": 0.0006764955110385986, "loss": 0.7993331, "num_input_tokens_seen": 174680976, "router_z_loss_mlp": 0.11987305, "routerloss_mlp": 0.0, "step": 2098, "time_per_iteration": 2.7805027961730957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113539, "balance_loss_mlp": 1.10162365, "diversity_loss_mlp": 0.0, "epoch": 0.40380915736821854, "flos": 519383425536.0, "grad_norm": 0.06520165677387538, "language_loss": 0.80479109, "learning_rate": 0.0006762039894905083, "loss": 0.81592649, "num_input_tokens_seen": 174753152, "router_z_loss_mlp": 0.11901855, "routerloss_mlp": 0.0, "step": 2099, "time_per_iteration": 2.5934462547302246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113864, "balance_loss_mlp": 1.10191941, "diversity_loss_mlp": 0.0, "epoch": 0.40400153905348213, "flos": 441925590528.0, "grad_norm": 0.07619139256642768, "language_loss": 0.80502266, "learning_rate": 0.000675912399533962, "loss": 0.81616127, "num_input_tokens_seen": 174817184, "router_z_loss_mlp": 0.11938477, "routerloss_mlp": 0.0, "step": 2100, "time_per_iteration": 2.5193917751312256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0095878, "balance_loss_mlp": 1.67460704, "diversity_loss_mlp": 0.21229821, "epoch": 0.40419392073874566, "flos": 772309002240.0, "grad_norm": 0.026749352452392162, "language_loss": 0.8501215, "learning_rate": 0.0006756207412821656, "loss": 0.85970926, "num_input_tokens_seen": 174898128, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01532745, "step": 2101, "time_per_iteration": 3.0674142837524414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01125351, "balance_loss_mlp": 1.11366224, "diversity_loss_mlp": 0.0, "epoch": 0.40438630242400925, "flos": 766569235968.0, "grad_norm": 0.07971707112625441, "language_loss": 0.80680853, "learning_rate": 0.0006753290148483505, "loss": 0.81806201, "num_input_tokens_seen": 174981872, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2102, "time_per_iteration": 3.0177412033081055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128591, "balance_loss_mlp": 1.11720061, "diversity_loss_mlp": 0.0, "epoch": 0.4045786841092728, "flos": 415235828736.0, "grad_norm": 0.07197972569419236, "language_loss": 0.78862077, "learning_rate": 0.0006750372203457752, "loss": 0.79990667, "num_input_tokens_seen": 175044976, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2103, "time_per_iteration": 2.4715232849121094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133472, "balance_loss_mlp": 1.12199795, "diversity_loss_mlp": 0.0, "epoch": 0.40477106579453637, "flos": 539214174720.0, "grad_norm": 0.05679089538273026, "language_loss": 0.8629868, "learning_rate": 0.0006747453578877242, "loss": 0.87432158, "num_input_tokens_seen": 175121104, "router_z_loss_mlp": 0.11468506, "routerloss_mlp": 0.0, "step": 2104, "time_per_iteration": 2.7127907276153564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133404, "balance_loss_mlp": 1.12154305, "diversity_loss_mlp": 0.0, "epoch": 0.4049634474797999, "flos": 826704258048.0, "grad_norm": 0.07881786572134404, "language_loss": 0.83325595, "learning_rate": 0.0006744534275875085, "loss": 0.84459001, "num_input_tokens_seen": 175194512, "router_z_loss_mlp": 0.11853027, "routerloss_mlp": 0.0, "step": 2105, "time_per_iteration": 2.9968934059143066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124156, "balance_loss_mlp": 1.11278331, "diversity_loss_mlp": 0.0, "epoch": 0.4051558291650635, "flos": 572684027904.0, "grad_norm": 0.06959652480101333, "language_loss": 0.85228348, "learning_rate": 0.0006741614295584657, "loss": 0.86352497, "num_input_tokens_seen": 175264176, "router_z_loss_mlp": 0.11364746, "routerloss_mlp": 0.0, "step": 2106, "time_per_iteration": 2.6837310791015625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128388, "balance_loss_mlp": 1.1166873, "diversity_loss_mlp": 0.0, "epoch": 0.4053482108503271, "flos": 731881391616.0, "grad_norm": 0.07271017039443997, "language_loss": 0.78820735, "learning_rate": 0.0006738693639139595, "loss": 0.79949123, "num_input_tokens_seen": 175347488, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2107, "time_per_iteration": 2.9876344203948975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111468, "balance_loss_mlp": 1.09982085, "diversity_loss_mlp": 0.0, "epoch": 0.4055405925355906, "flos": 1213059193344.0, "grad_norm": 0.07545270814647756, "language_loss": 0.7770499, "learning_rate": 0.0006735772307673796, "loss": 0.78816462, "num_input_tokens_seen": 175438336, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2108, "time_per_iteration": 3.5391368865966797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112529, "balance_loss_mlp": 1.1007216, "diversity_loss_mlp": 0.0, "epoch": 0.4057329742208542, "flos": 715863204864.0, "grad_norm": 0.07028810729839409, "language_loss": 0.8317976, "learning_rate": 0.0006732850302321421, "loss": 0.84292281, "num_input_tokens_seen": 175510912, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2109, "time_per_iteration": 2.924703359603882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107897, "balance_loss_mlp": 1.0962801, "diversity_loss_mlp": 0.0, "epoch": 0.4059253559061177, "flos": 564888577536.0, "grad_norm": 0.08331494403878895, "language_loss": 0.84220135, "learning_rate": 0.00067299276242169, "loss": 0.85328031, "num_input_tokens_seen": 175583040, "router_z_loss_mlp": 0.11608887, "routerloss_mlp": 0.0, "step": 2110, "time_per_iteration": 2.6628758907318115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00591895, "balance_loss_mlp": 1.01285744, "diversity_loss_mlp": 0.15005666, "epoch": 0.4061177375913813, "flos": 1593744450048.0, "grad_norm": 0.0011574932258311419, "language_loss": 0.74382168, "learning_rate": 0.0006727004274494908, "loss": 0.74974066, "num_input_tokens_seen": 175817952, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01043818, "step": 2111, "time_per_iteration": 4.913798093795776 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100112, "balance_loss_mlp": 1.0884769, "diversity_loss_mlp": 0.0, "epoch": 0.40631011927664484, "flos": 615421711872.0, "grad_norm": 0.0671840972805921, "language_loss": 0.77974957, "learning_rate": 0.0006724080254290395, "loss": 0.79075068, "num_input_tokens_seen": 175896352, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2112, "time_per_iteration": 2.790695905685425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087298, "balance_loss_mlp": 1.07509685, "diversity_loss_mlp": 0.0, "epoch": 0.40650250096190843, "flos": 557661376512.0, "grad_norm": 0.06921545909042545, "language_loss": 0.89956391, "learning_rate": 0.0006721155564738566, "loss": 0.91043687, "num_input_tokens_seen": 175967152, "router_z_loss_mlp": 0.12200928, "routerloss_mlp": 0.0, "step": 2113, "time_per_iteration": 2.654052495956421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00590146, "balance_loss_mlp": 1.01069736, "diversity_loss_mlp": 0.14874323, "epoch": 0.40669488264717196, "flos": 1580147564544.0, "grad_norm": 0.001129022163549877, "language_loss": 0.78622639, "learning_rate": 0.0006718230206974884, "loss": 0.79212785, "num_input_tokens_seen": 176205248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01042587, "step": 2114, "time_per_iteration": 5.02890682220459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095601, "balance_loss_mlp": 1.08348942, "diversity_loss_mlp": 0.0, "epoch": 0.40688726433243555, "flos": 507649503744.0, "grad_norm": 0.06673632265299649, "language_loss": 0.85678279, "learning_rate": 0.0006715304182135078, "loss": 0.86773884, "num_input_tokens_seen": 176276208, "router_z_loss_mlp": 0.12109375, "routerloss_mlp": 0.0, "step": 2115, "time_per_iteration": 2.6665151119232178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092243, "balance_loss_mlp": 1.07951176, "diversity_loss_mlp": 0.0, "epoch": 0.40707964601769914, "flos": 589075172352.0, "grad_norm": 0.08902530655488881, "language_loss": 0.8859638, "learning_rate": 0.0006712377491355127, "loss": 0.89688623, "num_input_tokens_seen": 176355072, "router_z_loss_mlp": 0.12731934, "routerloss_mlp": 0.0, "step": 2116, "time_per_iteration": 2.9124083518981934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091485, "balance_loss_mlp": 1.07896256, "diversity_loss_mlp": 0.0, "epoch": 0.40727202770296267, "flos": 580437259776.0, "grad_norm": 0.06275972542298792, "language_loss": 0.81009984, "learning_rate": 0.0006709450135771274, "loss": 0.8210147, "num_input_tokens_seen": 176444592, "router_z_loss_mlp": 0.12524414, "routerloss_mlp": 0.0, "step": 2117, "time_per_iteration": 2.9538469314575195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109045, "balance_loss_mlp": 1.07800436, "diversity_loss_mlp": 0.0, "epoch": 0.40746440938822626, "flos": 504076649472.0, "grad_norm": 0.06731197780562713, "language_loss": 0.8655895, "learning_rate": 0.0006706522116520023, "loss": 0.87649393, "num_input_tokens_seen": 176516144, "router_z_loss_mlp": 0.12445068, "routerloss_mlp": 0.0, "step": 2118, "time_per_iteration": 2.6403684616088867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109386, "balance_loss_mlp": 1.08127189, "diversity_loss_mlp": 0.0, "epoch": 0.4076567910734898, "flos": 605600312832.0, "grad_norm": 0.07339707473672348, "language_loss": 0.82936597, "learning_rate": 0.0006703593434738127, "loss": 0.84030455, "num_input_tokens_seen": 176585712, "router_z_loss_mlp": 0.12579346, "routerloss_mlp": 0.0, "step": 2119, "time_per_iteration": 2.706406354904175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096912, "balance_loss_mlp": 1.0847466, "diversity_loss_mlp": 0.0, "epoch": 0.4078491727587534, "flos": 479553799680.0, "grad_norm": 0.05750096894007485, "language_loss": 0.78123623, "learning_rate": 0.0006700664091562604, "loss": 0.79220533, "num_input_tokens_seen": 176654736, "router_z_loss_mlp": 0.12164307, "routerloss_mlp": 0.0, "step": 2120, "time_per_iteration": 2.5515992641448975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102687, "balance_loss_mlp": 1.09045601, "diversity_loss_mlp": 0.0, "epoch": 0.4080415544440169, "flos": 510384665088.0, "grad_norm": 0.08484846499370094, "language_loss": 0.85241771, "learning_rate": 0.0006697734088130725, "loss": 0.86344457, "num_input_tokens_seen": 176722800, "router_z_loss_mlp": 0.12231445, "routerloss_mlp": 0.0, "step": 2121, "time_per_iteration": 2.5997116565704346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094995, "balance_loss_mlp": 1.08268619, "diversity_loss_mlp": 0.0, "epoch": 0.4082339361292805, "flos": 734638947840.0, "grad_norm": 0.06901349076849703, "language_loss": 0.85628182, "learning_rate": 0.0006694803425580018, "loss": 0.86723173, "num_input_tokens_seen": 176800320, "router_z_loss_mlp": 0.12310791, "routerloss_mlp": 0.0, "step": 2122, "time_per_iteration": 2.975572109222412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090381, "balance_loss_mlp": 1.07825708, "diversity_loss_mlp": 0.0, "epoch": 0.4084263178145441, "flos": 457472074752.0, "grad_norm": 0.08123936309079019, "language_loss": 0.84420574, "learning_rate": 0.0006691872105048268, "loss": 0.85510951, "num_input_tokens_seen": 176867440, "router_z_loss_mlp": 0.12133789, "routerloss_mlp": 0.0, "step": 2123, "time_per_iteration": 2.5785253047943115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109138, "balance_loss_mlp": 1.07879114, "diversity_loss_mlp": 0.0, "epoch": 0.4086186994998076, "flos": 562931638272.0, "grad_norm": 0.06700388653835253, "language_loss": 0.84703517, "learning_rate": 0.0006688940127673513, "loss": 0.85794896, "num_input_tokens_seen": 176942048, "router_z_loss_mlp": 0.12597656, "routerloss_mlp": 0.0, "step": 2124, "time_per_iteration": 2.794312000274658 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080411, "balance_loss_mlp": 1.06789398, "diversity_loss_mlp": 0.0, "epoch": 0.4088110811850712, "flos": 573669651456.0, "grad_norm": 0.11477925500015464, "language_loss": 0.85646629, "learning_rate": 0.0006686007494594049, "loss": 0.86727041, "num_input_tokens_seen": 177025104, "router_z_loss_mlp": 0.12524414, "routerloss_mlp": 0.0, "step": 2125, "time_per_iteration": 2.8629977703094482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080594, "balance_loss_mlp": 1.06869102, "diversity_loss_mlp": 0.0, "epoch": 0.40900346287033473, "flos": 456930989568.0, "grad_norm": 0.08770785423003769, "language_loss": 0.80226219, "learning_rate": 0.0006683074206948425, "loss": 0.81306815, "num_input_tokens_seen": 177089296, "router_z_loss_mlp": 0.11901855, "routerloss_mlp": 0.0, "step": 2126, "time_per_iteration": 2.5477960109710693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080619, "balance_loss_mlp": 1.06884146, "diversity_loss_mlp": 0.0, "epoch": 0.4091958445555983, "flos": 617395903488.0, "grad_norm": 0.0688791895715759, "language_loss": 0.81257784, "learning_rate": 0.0006680140265875443, "loss": 0.82338405, "num_input_tokens_seen": 177163648, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2127, "time_per_iteration": 2.824706792831421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076947, "balance_loss_mlp": 1.06504989, "diversity_loss_mlp": 0.0, "epoch": 0.40938822624086185, "flos": 472400750592.0, "grad_norm": 0.0706270365820259, "language_loss": 0.95744675, "learning_rate": 0.0006677205672514162, "loss": 0.96821618, "num_input_tokens_seen": 177233856, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 2128, "time_per_iteration": 2.6173171997070312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081253, "balance_loss_mlp": 1.06944525, "diversity_loss_mlp": 0.0, "epoch": 0.40958060792612544, "flos": 570010535424.0, "grad_norm": 0.08385407721227026, "language_loss": 0.88751161, "learning_rate": 0.000667427042800389, "loss": 0.89832413, "num_input_tokens_seen": 177309824, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2129, "time_per_iteration": 2.746561288833618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090066, "balance_loss_mlp": 1.07828188, "diversity_loss_mlp": 0.0, "epoch": 0.40977298961138897, "flos": 609346063872.0, "grad_norm": 0.0802302808929841, "language_loss": 0.82728851, "learning_rate": 0.0006671334533484192, "loss": 0.83818918, "num_input_tokens_seen": 177380592, "router_z_loss_mlp": 0.11785889, "routerloss_mlp": 0.0, "step": 2130, "time_per_iteration": 2.7765390872955322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094937, "balance_loss_mlp": 1.08306408, "diversity_loss_mlp": 0.0, "epoch": 0.40996537129665256, "flos": 581744457216.0, "grad_norm": 0.06494454218377498, "language_loss": 0.83394802, "learning_rate": 0.0006668397990094881, "loss": 0.84489739, "num_input_tokens_seen": 177454720, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2131, "time_per_iteration": 2.6814444065093994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094306, "balance_loss_mlp": 1.08240891, "diversity_loss_mlp": 0.0, "epoch": 0.41015775298191615, "flos": 516546948096.0, "grad_norm": 0.08851492372685672, "language_loss": 0.84863144, "learning_rate": 0.0006665460798976027, "loss": 0.8595745, "num_input_tokens_seen": 177528224, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 2132, "time_per_iteration": 2.734208822250366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098079, "balance_loss_mlp": 1.08680749, "diversity_loss_mlp": 0.0, "epoch": 0.4103501346671797, "flos": 510354929664.0, "grad_norm": 0.07834997970618658, "language_loss": 0.8153789, "learning_rate": 0.0006662522961267947, "loss": 0.82635975, "num_input_tokens_seen": 177598176, "router_z_loss_mlp": 0.11273193, "routerloss_mlp": 0.0, "step": 2133, "time_per_iteration": 2.642789363861084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100422, "balance_loss_mlp": 1.0889008, "diversity_loss_mlp": 0.0, "epoch": 0.41054251635244327, "flos": 549752126976.0, "grad_norm": 0.06175420460070233, "language_loss": 0.87238759, "learning_rate": 0.0006659584478111211, "loss": 0.88339174, "num_input_tokens_seen": 177675840, "router_z_loss_mlp": 0.1151123, "routerloss_mlp": 0.0, "step": 2134, "time_per_iteration": 2.8097283840179443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110764, "balance_loss_mlp": 1.09618366, "diversity_loss_mlp": 0.0, "epoch": 0.4107348980377068, "flos": 839898450432.0, "grad_norm": 0.07261990262121029, "language_loss": 0.82762325, "learning_rate": 0.000665664535064664, "loss": 0.83869964, "num_input_tokens_seen": 177751376, "router_z_loss_mlp": 0.11468506, "routerloss_mlp": 0.0, "step": 2135, "time_per_iteration": 3.034973382949829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118337, "balance_loss_mlp": 1.10702372, "diversity_loss_mlp": 0.0, "epoch": 0.4109272797229704, "flos": 503708461056.0, "grad_norm": 0.07277612177905571, "language_loss": 0.82753229, "learning_rate": 0.0006653705580015303, "loss": 0.83871567, "num_input_tokens_seen": 177825264, "router_z_loss_mlp": 0.11309814, "routerloss_mlp": 0.0, "step": 2136, "time_per_iteration": 2.719024181365967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130623, "balance_loss_mlp": 1.11913705, "diversity_loss_mlp": 0.0, "epoch": 0.4111196614082339, "flos": 610830927360.0, "grad_norm": 0.09561286081072368, "language_loss": 0.86333638, "learning_rate": 0.0006650765167358523, "loss": 0.87464261, "num_input_tokens_seen": 177901680, "router_z_loss_mlp": 0.11474609, "routerloss_mlp": 0.0, "step": 2137, "time_per_iteration": 2.798013210296631 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119744, "balance_loss_mlp": 1.10816908, "diversity_loss_mlp": 0.0, "epoch": 0.4113120430934975, "flos": 453165414912.0, "grad_norm": 0.06575385598885217, "language_loss": 0.90120316, "learning_rate": 0.0006647824113817864, "loss": 0.9124006, "num_input_tokens_seen": 177965264, "router_z_loss_mlp": 0.11578369, "routerloss_mlp": 0.0, "step": 2138, "time_per_iteration": 2.5290029048919678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00941862, "balance_loss_mlp": 1.64172852, "diversity_loss_mlp": 0.21382158, "epoch": 0.41150442477876104, "flos": 541600971264.0, "grad_norm": 0.027199696320483784, "language_loss": 0.81782889, "learning_rate": 0.000664488242053515, "loss": 0.8272475, "num_input_tokens_seen": 178039712, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01408678, "step": 2139, "time_per_iteration": 2.7610864639282227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111286, "balance_loss_mlp": 1.1009748, "diversity_loss_mlp": 0.0, "epoch": 0.4116968064640246, "flos": 576291386880.0, "grad_norm": 0.07795493316399416, "language_loss": 0.83879304, "learning_rate": 0.0006641940088652445, "loss": 0.84992164, "num_input_tokens_seen": 178114080, "router_z_loss_mlp": 0.11877441, "routerloss_mlp": 0.0, "step": 2140, "time_per_iteration": 2.7797446250915527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098794, "balance_loss_mlp": 1.08682573, "diversity_loss_mlp": 0.0, "epoch": 0.4118891881492882, "flos": 496115642880.0, "grad_norm": 0.09321248474614077, "language_loss": 0.82214057, "learning_rate": 0.0006638997119312065, "loss": 0.83312857, "num_input_tokens_seen": 178188032, "router_z_loss_mlp": 0.11962891, "routerloss_mlp": 0.0, "step": 2141, "time_per_iteration": 2.688427209854126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082721, "balance_loss_mlp": 1.07580638, "diversity_loss_mlp": 0.0, "epoch": 0.41208156983455174, "flos": 1538395877376.0, "grad_norm": 0.05051376163622262, "language_loss": 0.75063306, "learning_rate": 0.0006636053513656568, "loss": 0.76146024, "num_input_tokens_seen": 178395328, "router_z_loss_mlp": 0.06933594, "routerloss_mlp": 0.0, "step": 2142, "time_per_iteration": 4.916438817977905 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084339, "balance_loss_mlp": 1.07186329, "diversity_loss_mlp": 0.0, "epoch": 0.41227395151981533, "flos": 584968946688.0, "grad_norm": 0.0666522569579182, "language_loss": 0.84487629, "learning_rate": 0.000663310927282877, "loss": 0.85571963, "num_input_tokens_seen": 178471952, "router_z_loss_mlp": 0.12475586, "routerloss_mlp": 0.0, "step": 2143, "time_per_iteration": 2.742781162261963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075707, "balance_loss_mlp": 1.06302905, "diversity_loss_mlp": 0.0, "epoch": 0.41246633320507886, "flos": 442926268416.0, "grad_norm": 0.07553146792883669, "language_loss": 0.85816187, "learning_rate": 0.000663016439797172, "loss": 0.86891896, "num_input_tokens_seen": 178542192, "router_z_loss_mlp": 0.12677002, "routerloss_mlp": 0.0, "step": 2144, "time_per_iteration": 2.602322578430176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075852, "balance_loss_mlp": 1.06363273, "diversity_loss_mlp": 0.0, "epoch": 0.41265871489034245, "flos": 579962985984.0, "grad_norm": 0.09188682549299809, "language_loss": 0.80924189, "learning_rate": 0.0006627218890228724, "loss": 0.82000041, "num_input_tokens_seen": 178622736, "router_z_loss_mlp": 0.12213135, "routerloss_mlp": 0.0, "step": 2145, "time_per_iteration": 2.76790452003479 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081048, "balance_loss_mlp": 1.0687809, "diversity_loss_mlp": 0.0, "epoch": 0.412851096575606, "flos": 761229964800.0, "grad_norm": 0.09235653357512275, "language_loss": 0.83860421, "learning_rate": 0.0006624272750743326, "loss": 0.84941471, "num_input_tokens_seen": 178705808, "router_z_loss_mlp": 0.12261963, "routerloss_mlp": 0.0, "step": 2146, "time_per_iteration": 2.986267566680908 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085622, "balance_loss_mlp": 1.073385, "diversity_loss_mlp": 0.0, "epoch": 0.41304347826086957, "flos": 555353501184.0, "grad_norm": 0.06221373460159241, "language_loss": 0.82866907, "learning_rate": 0.0006621325980659322, "loss": 0.83952528, "num_input_tokens_seen": 178781200, "router_z_loss_mlp": 0.12231445, "routerloss_mlp": 0.0, "step": 2147, "time_per_iteration": 2.78074049949646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091906, "balance_loss_mlp": 1.07981253, "diversity_loss_mlp": 0.0, "epoch": 0.41323585994613315, "flos": 665712940032.0, "grad_norm": 0.06655163113776748, "language_loss": 0.81613219, "learning_rate": 0.000661837858112075, "loss": 0.82705128, "num_input_tokens_seen": 178855072, "router_z_loss_mlp": 0.12097168, "routerloss_mlp": 0.0, "step": 2148, "time_per_iteration": 2.8118457794189453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00920817, "balance_loss_mlp": 1.59947157, "diversity_loss_mlp": 0.21162269, "epoch": 0.4134282416313967, "flos": 548699692032.0, "grad_norm": 0.03430222900415099, "language_loss": 0.88696158, "learning_rate": 0.0006615430553271888, "loss": 0.89616972, "num_input_tokens_seen": 178927936, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01526995, "step": 2149, "time_per_iteration": 2.809389352798462 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115952, "balance_loss_mlp": 1.10438299, "diversity_loss_mlp": 0.0, "epoch": 0.4136206233166603, "flos": 646262489088.0, "grad_norm": 0.06824786639125466, "language_loss": 0.85333586, "learning_rate": 0.0006612481898257264, "loss": 0.8644954, "num_input_tokens_seen": 179007792, "router_z_loss_mlp": 0.11566162, "routerloss_mlp": 0.0, "step": 2150, "time_per_iteration": 2.855074644088745 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137757, "balance_loss_mlp": 1.12599659, "diversity_loss_mlp": 0.0, "epoch": 0.4138130050019238, "flos": 517354905600.0, "grad_norm": 0.07789693292988349, "language_loss": 0.851385, "learning_rate": 0.000660953261722165, "loss": 0.86276257, "num_input_tokens_seen": 179075200, "router_z_loss_mlp": 0.11749268, "routerloss_mlp": 0.0, "step": 2151, "time_per_iteration": 2.5938022136688232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113704, "balance_loss_mlp": 1.12522054, "diversity_loss_mlp": 0.0, "epoch": 0.4140053866871874, "flos": 609254659584.0, "grad_norm": 0.08228338378299185, "language_loss": 0.82884097, "learning_rate": 0.0006606582711310055, "loss": 0.84021133, "num_input_tokens_seen": 179144448, "router_z_loss_mlp": 0.11816406, "routerloss_mlp": 0.0, "step": 2152, "time_per_iteration": 2.7282497882843018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145368, "balance_loss_mlp": 1.13366747, "diversity_loss_mlp": 0.0, "epoch": 0.4141977683724509, "flos": 579762925056.0, "grad_norm": 0.06559194318793425, "language_loss": 0.82812124, "learning_rate": 0.0006603632181667736, "loss": 0.83957493, "num_input_tokens_seen": 179215776, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2153, "time_per_iteration": 2.6664750576019287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103836, "balance_loss_mlp": 1.09754133, "diversity_loss_mlp": 0.0, "epoch": 0.4143901500577145, "flos": 1307312317440.0, "grad_norm": 0.03767833543400207, "language_loss": 0.78943324, "learning_rate": 0.0006600681029440187, "loss": 0.8004716, "num_input_tokens_seen": 179436688, "router_z_loss_mlp": 0.06298828, "routerloss_mlp": 0.0, "step": 2154, "time_per_iteration": 4.910309791564941 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01135237, "balance_loss_mlp": 1.12367392, "diversity_loss_mlp": 0.0, "epoch": 0.41458253174297804, "flos": 460189983744.0, "grad_norm": 0.0807614788835298, "language_loss": 0.81897664, "learning_rate": 0.0006597729255773153, "loss": 0.83032906, "num_input_tokens_seen": 179503264, "router_z_loss_mlp": 0.11560059, "routerloss_mlp": 0.0, "step": 2155, "time_per_iteration": 2.509021520614624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146058, "balance_loss_mlp": 1.13441765, "diversity_loss_mlp": 0.0, "epoch": 0.41477491342824163, "flos": 553364628480.0, "grad_norm": 0.07993173196210833, "language_loss": 0.82465029, "learning_rate": 0.0006594776861812608, "loss": 0.83611095, "num_input_tokens_seen": 179574864, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2156, "time_per_iteration": 2.656454086303711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01151315, "balance_loss_mlp": 1.13991857, "diversity_loss_mlp": 0.0, "epoch": 0.4149672951135052, "flos": 697771708416.0, "grad_norm": 0.06494614409867079, "language_loss": 0.8654387, "learning_rate": 0.0006591823848704776, "loss": 0.87695187, "num_input_tokens_seen": 179658208, "router_z_loss_mlp": 0.11395264, "routerloss_mlp": 0.0, "step": 2157, "time_per_iteration": 2.9039251804351807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01134696, "balance_loss_mlp": 1.12316287, "diversity_loss_mlp": 0.0, "epoch": 0.41515967679876875, "flos": 565750863360.0, "grad_norm": 0.07584878913150254, "language_loss": 0.81510401, "learning_rate": 0.0006588870217596117, "loss": 0.82645094, "num_input_tokens_seen": 179732320, "router_z_loss_mlp": 0.11517334, "routerloss_mlp": 0.0, "step": 2158, "time_per_iteration": 2.7366249561309814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121714, "balance_loss_mlp": 1.11010289, "diversity_loss_mlp": 0.0, "epoch": 0.41535205848403234, "flos": 501185843712.0, "grad_norm": 0.0768974217493938, "language_loss": 0.8567549, "learning_rate": 0.0006585915969633334, "loss": 0.86797202, "num_input_tokens_seen": 179801616, "router_z_loss_mlp": 0.11602783, "routerloss_mlp": 0.0, "step": 2159, "time_per_iteration": 2.557969331741333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105923, "balance_loss_mlp": 1.09437764, "diversity_loss_mlp": 0.0, "epoch": 0.41554444016929587, "flos": 607554680832.0, "grad_norm": 0.06453825749462137, "language_loss": 0.89545041, "learning_rate": 0.0006582961105963366, "loss": 0.90650964, "num_input_tokens_seen": 179876112, "router_z_loss_mlp": 0.11547852, "routerloss_mlp": 0.0, "step": 2160, "time_per_iteration": 2.782766103744507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089959, "balance_loss_mlp": 1.07836008, "diversity_loss_mlp": 0.0, "epoch": 0.41573682185455946, "flos": 529115991552.0, "grad_norm": 0.09389311079563152, "language_loss": 0.77639234, "learning_rate": 0.0006580005627733395, "loss": 0.78729188, "num_input_tokens_seen": 179949936, "router_z_loss_mlp": 0.11590576, "routerloss_mlp": 0.0, "step": 2161, "time_per_iteration": 2.7049734592437744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086569, "balance_loss_mlp": 1.07492197, "diversity_loss_mlp": 0.0, "epoch": 0.415929203539823, "flos": 504956187648.0, "grad_norm": 0.08236412019602501, "language_loss": 0.81618345, "learning_rate": 0.0006577049536090838, "loss": 0.8270492, "num_input_tokens_seen": 180023184, "router_z_loss_mlp": 0.11645508, "routerloss_mlp": 0.0, "step": 2162, "time_per_iteration": 2.723243236541748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078674, "balance_loss_mlp": 1.06676459, "diversity_loss_mlp": 0.0, "epoch": 0.4161215852250866, "flos": 582737794560.0, "grad_norm": 0.09869721655750711, "language_loss": 0.85591501, "learning_rate": 0.000657409283218335, "loss": 0.86670172, "num_input_tokens_seen": 180091728, "router_z_loss_mlp": 0.11901855, "routerloss_mlp": 0.0, "step": 2163, "time_per_iteration": 2.64973783493042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078005, "balance_loss_mlp": 1.0662148, "diversity_loss_mlp": 0.0, "epoch": 0.4163139669103501, "flos": 490697077248.0, "grad_norm": 0.06806079796586995, "language_loss": 0.81014043, "learning_rate": 0.0006571135517158829, "loss": 0.82092047, "num_input_tokens_seen": 180162096, "router_z_loss_mlp": 0.11785889, "routerloss_mlp": 0.0, "step": 2164, "time_per_iteration": 2.6662614345550537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01261192, "balance_loss_mlp": 1.25542271, "diversity_loss_mlp": 0.0, "epoch": 0.4165063485956137, "flos": 1288158474240.0, "grad_norm": 0.0963910676883023, "language_loss": 0.76764059, "learning_rate": 0.0006568177592165404, "loss": 0.78025252, "num_input_tokens_seen": 180380912, "router_z_loss_mlp": 0.05761719, "routerloss_mlp": 0.0, "step": 2165, "time_per_iteration": 4.733267068862915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084468, "balance_loss_mlp": 1.07227921, "diversity_loss_mlp": 0.0, "epoch": 0.4166987302808773, "flos": 495263268864.0, "grad_norm": 0.08489426271121504, "language_loss": 0.83098751, "learning_rate": 0.0006565219058351444, "loss": 0.84183216, "num_input_tokens_seen": 180447424, "router_z_loss_mlp": 0.12194824, "routerloss_mlp": 0.0, "step": 2166, "time_per_iteration": 2.555367946624756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087654, "balance_loss_mlp": 1.07506573, "diversity_loss_mlp": 0.0, "epoch": 0.4168911119661408, "flos": 464071555584.0, "grad_norm": 0.0663020588108057, "language_loss": 0.82663929, "learning_rate": 0.0006562259916865553, "loss": 0.83751583, "num_input_tokens_seen": 180516336, "router_z_loss_mlp": 0.12585449, "routerloss_mlp": 0.0, "step": 2167, "time_per_iteration": 2.5647947788238525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085528, "balance_loss_mlp": 1.07305884, "diversity_loss_mlp": 0.0, "epoch": 0.4170834936514044, "flos": 536787730944.0, "grad_norm": 0.11811458423881586, "language_loss": 0.79392177, "learning_rate": 0.0006559300168856573, "loss": 0.80477709, "num_input_tokens_seen": 180589824, "router_z_loss_mlp": 0.12481689, "routerloss_mlp": 0.0, "step": 2168, "time_per_iteration": 2.737071990966797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090803, "balance_loss_mlp": 1.07860184, "diversity_loss_mlp": 0.0, "epoch": 0.41727587533666793, "flos": 550683795456.0, "grad_norm": 0.07183663020795078, "language_loss": 0.86060214, "learning_rate": 0.0006556339815473577, "loss": 0.87151015, "num_input_tokens_seen": 180661296, "router_z_loss_mlp": 0.12200928, "routerloss_mlp": 0.0, "step": 2169, "time_per_iteration": 2.6506707668304443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087149, "balance_loss_mlp": 1.07504892, "diversity_loss_mlp": 0.0, "epoch": 0.4174682570219315, "flos": 631111357440.0, "grad_norm": 0.07609133400056706, "language_loss": 0.86409211, "learning_rate": 0.000655337885786588, "loss": 0.87496364, "num_input_tokens_seen": 180744896, "router_z_loss_mlp": 0.12103271, "routerloss_mlp": 0.0, "step": 2170, "time_per_iteration": 2.8835949897766113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078201, "balance_loss_mlp": 1.06654263, "diversity_loss_mlp": 0.0, "epoch": 0.41766063870719505, "flos": 519751613952.0, "grad_norm": 0.08298304012821277, "language_loss": 0.85129267, "learning_rate": 0.0006550417297183025, "loss": 0.86207461, "num_input_tokens_seen": 180813008, "router_z_loss_mlp": 0.11651611, "routerloss_mlp": 0.0, "step": 2171, "time_per_iteration": 2.6195385456085205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087717, "balance_loss_mlp": 1.07584357, "diversity_loss_mlp": 0.0, "epoch": 0.41785302039245864, "flos": 557935589376.0, "grad_norm": 0.07223590906341684, "language_loss": 0.81395489, "learning_rate": 0.0006547455134574793, "loss": 0.82483202, "num_input_tokens_seen": 180886480, "router_z_loss_mlp": 0.11877441, "routerloss_mlp": 0.0, "step": 2172, "time_per_iteration": 2.688387155532837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091071, "balance_loss_mlp": 1.07947183, "diversity_loss_mlp": 0.0, "epoch": 0.41804540207772223, "flos": 788529821184.0, "grad_norm": 0.06986640066350178, "language_loss": 0.84520721, "learning_rate": 0.0006544492371191198, "loss": 0.85611784, "num_input_tokens_seen": 180973776, "router_z_loss_mlp": 0.11590576, "routerloss_mlp": 0.0, "step": 2173, "time_per_iteration": 3.1099753379821777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094341, "balance_loss_mlp": 1.08226562, "diversity_loss_mlp": 0.0, "epoch": 0.41823778376298576, "flos": 904332418560.0, "grad_norm": 0.06657472623207703, "language_loss": 0.8341983, "learning_rate": 0.0006541529008182485, "loss": 0.84514177, "num_input_tokens_seen": 181062768, "router_z_loss_mlp": 0.12072754, "routerloss_mlp": 0.0, "step": 2174, "time_per_iteration": 3.203376054763794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107353, "balance_loss_mlp": 1.09567666, "diversity_loss_mlp": 0.0, "epoch": 0.41843016544824935, "flos": 511560811008.0, "grad_norm": 0.07167092475387357, "language_loss": 0.87561977, "learning_rate": 0.0006538565046699136, "loss": 0.8866933, "num_input_tokens_seen": 181129872, "router_z_loss_mlp": 0.11669922, "routerloss_mlp": 0.0, "step": 2175, "time_per_iteration": 2.6136248111724854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122872, "balance_loss_mlp": 1.1111474, "diversity_loss_mlp": 0.0, "epoch": 0.4186225471335129, "flos": 653077085184.0, "grad_norm": 0.08073018870716439, "language_loss": 0.81308544, "learning_rate": 0.0006535600487891862, "loss": 0.82431418, "num_input_tokens_seen": 181208112, "router_z_loss_mlp": 0.1171875, "routerloss_mlp": 0.0, "step": 2176, "time_per_iteration": 2.8484995365142822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112142, "balance_loss_mlp": 1.10968423, "diversity_loss_mlp": 0.0, "epoch": 0.41881492881877647, "flos": 569158161408.0, "grad_norm": 0.06933020813080157, "language_loss": 0.89047962, "learning_rate": 0.0006532635332911603, "loss": 0.90169382, "num_input_tokens_seen": 181278736, "router_z_loss_mlp": 0.11730957, "routerloss_mlp": 0.0, "step": 2177, "time_per_iteration": 2.6983814239501953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01139797, "balance_loss_mlp": 1.12828767, "diversity_loss_mlp": 0.0, "epoch": 0.41900731050404, "flos": 911878248960.0, "grad_norm": 0.07833316419755533, "language_loss": 0.80340332, "learning_rate": 0.0006529669582909541, "loss": 0.81480134, "num_input_tokens_seen": 181362512, "router_z_loss_mlp": 0.11499023, "routerloss_mlp": 0.0, "step": 2178, "time_per_iteration": 3.247034788131714 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130167, "balance_loss_mlp": 1.11881781, "diversity_loss_mlp": 0.0, "epoch": 0.4191996921893036, "flos": 535755119616.0, "grad_norm": 0.08850961832331757, "language_loss": 0.85867965, "learning_rate": 0.0006526703239037077, "loss": 0.86998129, "num_input_tokens_seen": 181432080, "router_z_loss_mlp": 0.11346436, "routerloss_mlp": 0.0, "step": 2179, "time_per_iteration": 2.6653683185577393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00933718, "balance_loss_mlp": 1.62844765, "diversity_loss_mlp": 0.20954823, "epoch": 0.4193920738745671, "flos": 582636478464.0, "grad_norm": 0.029582524443817385, "language_loss": 0.86593473, "learning_rate": 0.0006523736302445851, "loss": 0.87527192, "num_input_tokens_seen": 181507296, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01471971, "step": 2180, "time_per_iteration": 2.857030153274536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120353, "balance_loss_mlp": 1.10893881, "diversity_loss_mlp": 0.0, "epoch": 0.4195844555598307, "flos": 1335782472192.0, "grad_norm": 0.0687803817541909, "language_loss": 0.77392578, "learning_rate": 0.0006520768774287728, "loss": 0.78512931, "num_input_tokens_seen": 181599408, "router_z_loss_mlp": 0.11413574, "routerloss_mlp": 0.0, "step": 2181, "time_per_iteration": 5.625683307647705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114289, "balance_loss_mlp": 1.10282135, "diversity_loss_mlp": 0.0, "epoch": 0.4197768372450943, "flos": 598783145472.0, "grad_norm": 0.06088029266780351, "language_loss": 0.85493296, "learning_rate": 0.0006517800655714806, "loss": 0.86607587, "num_input_tokens_seen": 181674944, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2182, "time_per_iteration": 2.812955617904663 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105622, "balance_loss_mlp": 1.09442866, "diversity_loss_mlp": 0.0, "epoch": 0.4199692189303578, "flos": 735261525504.0, "grad_norm": 0.07098705372074567, "language_loss": 0.85399854, "learning_rate": 0.0006514831947879407, "loss": 0.86505473, "num_input_tokens_seen": 181756704, "router_z_loss_mlp": 0.11193848, "routerloss_mlp": 0.0, "step": 2183, "time_per_iteration": 2.961418867111206 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097617, "balance_loss_mlp": 1.08642888, "diversity_loss_mlp": 0.0, "epoch": 0.4201616006156214, "flos": 750214794240.0, "grad_norm": 0.08450852264083888, "language_loss": 0.78323019, "learning_rate": 0.0006511862651934091, "loss": 0.79420632, "num_input_tokens_seen": 181837952, "router_z_loss_mlp": 0.11181641, "routerloss_mlp": 0.0, "step": 2184, "time_per_iteration": 3.076414108276367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091172, "balance_loss_mlp": 1.07956707, "diversity_loss_mlp": 0.0, "epoch": 0.42035398230088494, "flos": 547029448704.0, "grad_norm": 0.06921087236063693, "language_loss": 0.82092035, "learning_rate": 0.0006508892769031638, "loss": 0.83183205, "num_input_tokens_seen": 181906896, "router_z_loss_mlp": 0.11602783, "routerloss_mlp": 0.0, "step": 2185, "time_per_iteration": 2.638606309890747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089875, "balance_loss_mlp": 1.07868707, "diversity_loss_mlp": 0.0, "epoch": 0.42054636398614853, "flos": 616911717888.0, "grad_norm": 0.07895440454445611, "language_loss": 0.87322706, "learning_rate": 0.000650592230032506, "loss": 0.88412583, "num_input_tokens_seen": 181974976, "router_z_loss_mlp": 0.11187744, "routerloss_mlp": 0.0, "step": 2186, "time_per_iteration": 2.702061176300049 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093256, "balance_loss_mlp": 1.0815382, "diversity_loss_mlp": 0.0, "epoch": 0.42073874567141206, "flos": 640394242560.0, "grad_norm": 0.07748698496632533, "language_loss": 0.85121393, "learning_rate": 0.0006502951246967595, "loss": 0.8621465, "num_input_tokens_seen": 182054704, "router_z_loss_mlp": 0.11706543, "routerloss_mlp": 0.0, "step": 2187, "time_per_iteration": 2.871629476547241 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087261, "balance_loss_mlp": 1.07582331, "diversity_loss_mlp": 0.0, "epoch": 0.42093112735667565, "flos": 493783174656.0, "grad_norm": 0.06016607527200091, "language_loss": 0.86913472, "learning_rate": 0.0006499979610112706, "loss": 0.88000733, "num_input_tokens_seen": 182129696, "router_z_loss_mlp": 0.11425781, "routerloss_mlp": 0.0, "step": 2188, "time_per_iteration": 2.795278787612915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107962, "balance_loss_mlp": 1.06803894, "diversity_loss_mlp": 0.0, "epoch": 0.4211235090419392, "flos": 542364512256.0, "grad_norm": 0.0593739697007924, "language_loss": 0.84024572, "learning_rate": 0.000649700739091409, "loss": 0.85104191, "num_input_tokens_seen": 182203792, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2189, "time_per_iteration": 2.822756290435791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123251, "balance_loss_mlp": 1.11500144, "diversity_loss_mlp": 0.0, "epoch": 0.42131589072720277, "flos": 1532149530624.0, "grad_norm": 0.03860831682793276, "language_loss": 0.73836273, "learning_rate": 0.0006494034590525657, "loss": 0.74959522, "num_input_tokens_seen": 182432080, "router_z_loss_mlp": 0.08251953, "routerloss_mlp": 0.0, "step": 2190, "time_per_iteration": 4.79919958114624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082066, "balance_loss_mlp": 1.07052088, "diversity_loss_mlp": 0.0, "epoch": 0.42150827241246636, "flos": 566852857344.0, "grad_norm": 0.06761793691364075, "language_loss": 0.85737348, "learning_rate": 0.0006491061210101557, "loss": 0.86819422, "num_input_tokens_seen": 182500256, "router_z_loss_mlp": 0.11535645, "routerloss_mlp": 0.0, "step": 2191, "time_per_iteration": 2.661578416824341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094285, "balance_loss_mlp": 1.08270931, "diversity_loss_mlp": 0.0, "epoch": 0.4217006540977299, "flos": 707561174016.0, "grad_norm": 0.0725556462678514, "language_loss": 0.83956218, "learning_rate": 0.0006488087250796157, "loss": 0.85050505, "num_input_tokens_seen": 182582912, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2192, "time_per_iteration": 2.881225347518921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095445, "balance_loss_mlp": 1.08376861, "diversity_loss_mlp": 0.0, "epoch": 0.4218930357829935, "flos": 627291454464.0, "grad_norm": 0.09298126342392905, "language_loss": 0.81662476, "learning_rate": 0.0006485112713764049, "loss": 0.82757914, "num_input_tokens_seen": 182670304, "router_z_loss_mlp": 0.11669922, "routerloss_mlp": 0.0, "step": 2193, "time_per_iteration": 2.8921914100646973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093733, "balance_loss_mlp": 1.08214593, "diversity_loss_mlp": 0.0, "epoch": 0.422085417468257, "flos": 460345628160.0, "grad_norm": 0.058244545196029895, "language_loss": 0.83715278, "learning_rate": 0.0006482137600160051, "loss": 0.84809017, "num_input_tokens_seen": 182735024, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2194, "time_per_iteration": 2.484341859817505 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094196, "balance_loss_mlp": 1.08240056, "diversity_loss_mlp": 0.0, "epoch": 0.4222777991535206, "flos": 474026577408.0, "grad_norm": 0.08574033239321836, "language_loss": 0.847399, "learning_rate": 0.0006479161911139206, "loss": 0.85834098, "num_input_tokens_seen": 182805024, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2195, "time_per_iteration": 2.5937106609344482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082105, "balance_loss_mlp": 1.07043433, "diversity_loss_mlp": 0.0, "epoch": 0.4224701808387841, "flos": 470886151680.0, "grad_norm": 0.08791937036502419, "language_loss": 0.85522735, "learning_rate": 0.0006476185647856778, "loss": 0.86604846, "num_input_tokens_seen": 182871360, "router_z_loss_mlp": 0.11657715, "routerloss_mlp": 0.0, "step": 2196, "time_per_iteration": 2.569899559020996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080125, "balance_loss_mlp": 1.06815672, "diversity_loss_mlp": 0.0, "epoch": 0.4226625625240477, "flos": 677525783040.0, "grad_norm": 0.07778870715402122, "language_loss": 0.82192588, "learning_rate": 0.0006473208811468255, "loss": 0.83272707, "num_input_tokens_seen": 182952912, "router_z_loss_mlp": 0.11962891, "routerloss_mlp": 0.0, "step": 2197, "time_per_iteration": 2.899557113647461 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072137, "balance_loss_mlp": 1.06046605, "diversity_loss_mlp": 0.0, "epoch": 0.4228549442093113, "flos": 503525652480.0, "grad_norm": 0.07330307904629892, "language_loss": 0.84140831, "learning_rate": 0.0006470231403129347, "loss": 0.85212964, "num_input_tokens_seen": 183022016, "router_z_loss_mlp": 0.11663818, "routerloss_mlp": 0.0, "step": 2198, "time_per_iteration": 2.602447509765625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106933, "balance_loss_mlp": 1.05760026, "diversity_loss_mlp": 0.0, "epoch": 0.42304732589457483, "flos": 611848857600.0, "grad_norm": 0.06409293690085444, "language_loss": 0.81590885, "learning_rate": 0.0006467253423995988, "loss": 0.82660222, "num_input_tokens_seen": 183101776, "router_z_loss_mlp": 0.11712646, "routerloss_mlp": 0.0, "step": 2199, "time_per_iteration": 2.8557229042053223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107934, "balance_loss_mlp": 1.06755078, "diversity_loss_mlp": 0.0, "epoch": 0.4232397075798384, "flos": 515570863104.0, "grad_norm": 0.07244216805562081, "language_loss": 0.78831869, "learning_rate": 0.000646427487522433, "loss": 0.79911208, "num_input_tokens_seen": 183171392, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2200, "time_per_iteration": 2.65742826461792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084908, "balance_loss_mlp": 1.07336855, "diversity_loss_mlp": 0.0, "epoch": 0.42343208926510195, "flos": 589796868096.0, "grad_norm": 0.07121994515744344, "language_loss": 0.83032513, "learning_rate": 0.0006461295757970749, "loss": 0.84117424, "num_input_tokens_seen": 183253936, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2201, "time_per_iteration": 2.950655698776245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090426, "balance_loss_mlp": 1.07880902, "diversity_loss_mlp": 0.0, "epoch": 0.42362447095036554, "flos": 640636521984.0, "grad_norm": 0.07713064950594434, "language_loss": 0.81538546, "learning_rate": 0.0006458316073391839, "loss": 0.82628965, "num_input_tokens_seen": 183333744, "router_z_loss_mlp": 0.1161499, "routerloss_mlp": 0.0, "step": 2202, "time_per_iteration": 2.8609914779663086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089677, "balance_loss_mlp": 1.07874584, "diversity_loss_mlp": 0.0, "epoch": 0.42381685263562907, "flos": 512680057344.0, "grad_norm": 0.07022827859020209, "language_loss": 0.87709206, "learning_rate": 0.0006455335822644422, "loss": 0.88798881, "num_input_tokens_seen": 183401904, "router_z_loss_mlp": 0.109375, "routerloss_mlp": 0.0, "step": 2203, "time_per_iteration": 2.6978323459625244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118526, "balance_loss_mlp": 1.10743332, "diversity_loss_mlp": 0.0, "epoch": 0.42400923432089266, "flos": 546782400000.0, "grad_norm": 0.08724206882012846, "language_loss": 0.78530163, "learning_rate": 0.0006452355006885527, "loss": 0.79648691, "num_input_tokens_seen": 183471312, "router_z_loss_mlp": 0.11090088, "routerloss_mlp": 0.0, "step": 2204, "time_per_iteration": 2.686579704284668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00922718, "balance_loss_mlp": 1.60671031, "diversity_loss_mlp": 0.20807257, "epoch": 0.4242016160061562, "flos": 622154815488.0, "grad_norm": 0.038668439213979985, "language_loss": 0.8761735, "learning_rate": 0.0006449373627272412, "loss": 0.88540065, "num_input_tokens_seen": 183539184, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01532654, "step": 2205, "time_per_iteration": 2.7558722496032715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112883, "balance_loss_mlp": 1.10164738, "diversity_loss_mlp": 0.0, "epoch": 0.4243939976914198, "flos": 571913146368.0, "grad_norm": 0.08032286277613819, "language_loss": 0.82142913, "learning_rate": 0.0006446391684962553, "loss": 0.83255792, "num_input_tokens_seen": 183607504, "router_z_loss_mlp": 0.11230469, "routerloss_mlp": 0.0, "step": 2206, "time_per_iteration": 2.6579248905181885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117757, "balance_loss_mlp": 1.10650921, "diversity_loss_mlp": 0.0, "epoch": 0.42458637937668336, "flos": 448740186624.0, "grad_norm": 0.06707307211931093, "language_loss": 0.82899106, "learning_rate": 0.000644340918111364, "loss": 0.8401686, "num_input_tokens_seen": 183674720, "router_z_loss_mlp": 0.11248779, "routerloss_mlp": 0.0, "step": 2207, "time_per_iteration": 2.5347208976745605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01117145, "balance_loss_mlp": 1.10573626, "diversity_loss_mlp": 0.0, "epoch": 0.4247787610619469, "flos": 435407602176.0, "grad_norm": 0.09153331321335235, "language_loss": 0.84820396, "learning_rate": 0.0006440426116883585, "loss": 0.85937536, "num_input_tokens_seen": 183740448, "router_z_loss_mlp": 0.11401367, "routerloss_mlp": 0.0, "step": 2208, "time_per_iteration": 2.5513036251068115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112411, "balance_loss_mlp": 1.11258864, "diversity_loss_mlp": 0.0, "epoch": 0.4249711427472105, "flos": 496078566912.0, "grad_norm": 0.07442494649717855, "language_loss": 0.86227304, "learning_rate": 0.0006437442493430519, "loss": 0.87351412, "num_input_tokens_seen": 183812640, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2209, "time_per_iteration": 2.6560840606689453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120019, "balance_loss_mlp": 1.10829473, "diversity_loss_mlp": 0.0, "epoch": 0.425163524432474, "flos": 655819587072.0, "grad_norm": 0.09545289030190586, "language_loss": 0.86441422, "learning_rate": 0.000643445831191278, "loss": 0.8756144, "num_input_tokens_seen": 183895312, "router_z_loss_mlp": 0.1171875, "routerloss_mlp": 0.0, "step": 2210, "time_per_iteration": 2.9028308391571045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103179, "balance_loss_mlp": 1.09162724, "diversity_loss_mlp": 0.0, "epoch": 0.4253559061177376, "flos": 650608796160.0, "grad_norm": 0.07646392549286844, "language_loss": 0.81526744, "learning_rate": 0.0006431473573488937, "loss": 0.82629919, "num_input_tokens_seen": 183966384, "router_z_loss_mlp": 0.11547852, "routerloss_mlp": 0.0, "step": 2211, "time_per_iteration": 2.7377443313598633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089807, "balance_loss_mlp": 1.0782795, "diversity_loss_mlp": 0.0, "epoch": 0.42554828780300114, "flos": 554155333632.0, "grad_norm": 0.08107145257136338, "language_loss": 0.85147351, "learning_rate": 0.0006428488279317765, "loss": 0.86237156, "num_input_tokens_seen": 184031728, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2212, "time_per_iteration": 2.6276626586914062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109177, "balance_loss_mlp": 1.08065951, "diversity_loss_mlp": 0.0, "epoch": 0.4257406694882647, "flos": 514407200256.0, "grad_norm": 0.09124161172132733, "language_loss": 0.87490094, "learning_rate": 0.0006425502430558259, "loss": 0.88581866, "num_input_tokens_seen": 184096160, "router_z_loss_mlp": 0.11120605, "routerloss_mlp": 0.0, "step": 2213, "time_per_iteration": 2.588928699493408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109161, "balance_loss_mlp": 1.08046961, "diversity_loss_mlp": 0.0, "epoch": 0.42593305117352825, "flos": 515645015040.0, "grad_norm": 0.06865062693642494, "language_loss": 0.84588826, "learning_rate": 0.0006422516028369628, "loss": 0.85680431, "num_input_tokens_seen": 184169664, "router_z_loss_mlp": 0.11138916, "routerloss_mlp": 0.0, "step": 2214, "time_per_iteration": 2.639619827270508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085781, "balance_loss_mlp": 1.07456374, "diversity_loss_mlp": 0.0, "epoch": 0.42612543285879184, "flos": 588059813376.0, "grad_norm": 0.06481575152476399, "language_loss": 0.83497036, "learning_rate": 0.0006419529073911296, "loss": 0.84582818, "num_input_tokens_seen": 184249152, "router_z_loss_mlp": 0.11218262, "routerloss_mlp": 0.0, "step": 2215, "time_per_iteration": 2.8564555644989014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091551, "balance_loss_mlp": 1.08075058, "diversity_loss_mlp": 0.0, "epoch": 0.42631781454405543, "flos": 635472345600.0, "grad_norm": 0.07537518077633425, "language_loss": 0.85102242, "learning_rate": 0.0006416541568342901, "loss": 0.86193788, "num_input_tokens_seen": 184326816, "router_z_loss_mlp": 0.10797119, "routerloss_mlp": 0.0, "step": 2216, "time_per_iteration": 2.8998327255249023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082292, "balance_loss_mlp": 1.07092535, "diversity_loss_mlp": 0.0, "epoch": 0.42651019622931896, "flos": 541161202176.0, "grad_norm": 0.06331803259599181, "language_loss": 0.84347832, "learning_rate": 0.0006413553512824297, "loss": 0.85430121, "num_input_tokens_seen": 184404336, "router_z_loss_mlp": 0.1137085, "routerloss_mlp": 0.0, "step": 2217, "time_per_iteration": 2.754044532775879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084192, "balance_loss_mlp": 1.07307625, "diversity_loss_mlp": 0.0, "epoch": 0.42670257791458255, "flos": 558158045184.0, "grad_norm": 0.07616444203019798, "language_loss": 0.84374213, "learning_rate": 0.0006410564908515549, "loss": 0.85458404, "num_input_tokens_seen": 184472320, "router_z_loss_mlp": 0.11114502, "routerloss_mlp": 0.0, "step": 2218, "time_per_iteration": 2.724478006362915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081004, "balance_loss_mlp": 1.06966138, "diversity_loss_mlp": 0.0, "epoch": 0.4268949595998461, "flos": 621309782016.0, "grad_norm": 0.0731173396075932, "language_loss": 0.85161233, "learning_rate": 0.0006407575756576935, "loss": 0.86242241, "num_input_tokens_seen": 184544704, "router_z_loss_mlp": 0.11334229, "routerloss_mlp": 0.0, "step": 2219, "time_per_iteration": 2.754624128341675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093699, "balance_loss_mlp": 1.08191478, "diversity_loss_mlp": 0.0, "epoch": 0.42708734128510967, "flos": 537919460352.0, "grad_norm": 0.068521011535794, "language_loss": 0.87612599, "learning_rate": 0.0006404586058168951, "loss": 0.88706297, "num_input_tokens_seen": 184622544, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2220, "time_per_iteration": 2.6972298622131348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100965, "balance_loss_mlp": 1.08927631, "diversity_loss_mlp": 0.0, "epoch": 0.4272797229703732, "flos": 502865998848.0, "grad_norm": 0.1033551804820373, "language_loss": 0.86327708, "learning_rate": 0.0006401595814452296, "loss": 0.87428677, "num_input_tokens_seen": 184692544, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2221, "time_per_iteration": 2.6071925163269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100016, "balance_loss_mlp": 1.08816695, "diversity_loss_mlp": 0.0, "epoch": 0.4274721046556368, "flos": 492453955584.0, "grad_norm": 0.07649462730323824, "language_loss": 0.8070569, "learning_rate": 0.000639860502658789, "loss": 0.81805706, "num_input_tokens_seen": 184760480, "router_z_loss_mlp": 0.1184082, "routerloss_mlp": 0.0, "step": 2222, "time_per_iteration": 2.6844141483306885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101843, "balance_loss_mlp": 1.08965993, "diversity_loss_mlp": 0.0, "epoch": 0.4276644863409004, "flos": 568367456256.0, "grad_norm": 0.0652732350229211, "language_loss": 0.84929889, "learning_rate": 0.0006395613695736853, "loss": 0.86031729, "num_input_tokens_seen": 184834080, "router_z_loss_mlp": 0.1217041, "routerloss_mlp": 0.0, "step": 2223, "time_per_iteration": 2.6799042224884033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091013, "balance_loss_mlp": 1.07850194, "diversity_loss_mlp": 0.0, "epoch": 0.4278568680261639, "flos": 607436112384.0, "grad_norm": 0.10552751254703834, "language_loss": 0.82026577, "learning_rate": 0.0006392621823060529, "loss": 0.83117592, "num_input_tokens_seen": 184905872, "router_z_loss_mlp": 0.12518311, "routerloss_mlp": 0.0, "step": 2224, "time_per_iteration": 2.722675323486328 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083211, "balance_loss_mlp": 1.07109332, "diversity_loss_mlp": 0.0, "epoch": 0.4280492497114275, "flos": 560527589376.0, "grad_norm": 0.0790777786133485, "language_loss": 0.8508532, "learning_rate": 0.0006389629409720465, "loss": 0.86168534, "num_input_tokens_seen": 184972320, "router_z_loss_mlp": 0.12115479, "routerloss_mlp": 0.0, "step": 2225, "time_per_iteration": 2.6559393405914307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084084, "balance_loss_mlp": 1.07179379, "diversity_loss_mlp": 0.0, "epoch": 0.428241631396691, "flos": 720646709760.0, "grad_norm": 0.0811747132385773, "language_loss": 0.88654399, "learning_rate": 0.0006386636456878417, "loss": 0.89738482, "num_input_tokens_seen": 185051040, "router_z_loss_mlp": 0.12298584, "routerloss_mlp": 0.0, "step": 2226, "time_per_iteration": 2.898261308670044 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083891, "balance_loss_mlp": 1.07153535, "diversity_loss_mlp": 0.0, "epoch": 0.4284340130819546, "flos": 429467774976.0, "grad_norm": 0.07696212536929578, "language_loss": 0.92413348, "learning_rate": 0.0006383642965696353, "loss": 0.93497235, "num_input_tokens_seen": 185113552, "router_z_loss_mlp": 0.12353516, "routerloss_mlp": 0.0, "step": 2227, "time_per_iteration": 2.467622995376587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00932178, "balance_loss_mlp": 1.62005818, "diversity_loss_mlp": 0.21207821, "epoch": 0.42862639476721814, "flos": 525016733184.0, "grad_norm": 0.033827312051000154, "language_loss": 0.83018744, "learning_rate": 0.000638064893733645, "loss": 0.83950925, "num_input_tokens_seen": 185185056, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01611001, "step": 2228, "time_per_iteration": 2.74554705619812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00939878, "balance_loss_mlp": 1.63503206, "diversity_loss_mlp": 0.21170495, "epoch": 0.42881877645248173, "flos": 465346446336.0, "grad_norm": 0.03357304306136308, "language_loss": 0.90087909, "learning_rate": 0.000637765437296109, "loss": 0.91027784, "num_input_tokens_seen": 185257248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01650969, "step": 2229, "time_per_iteration": 2.6807308197021484 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086799, "balance_loss_mlp": 1.07446718, "diversity_loss_mlp": 0.0, "epoch": 0.42901115813774526, "flos": 560297793024.0, "grad_norm": 0.09425394332621637, "language_loss": 0.85585725, "learning_rate": 0.000637465927373287, "loss": 0.86672527, "num_input_tokens_seen": 185324800, "router_z_loss_mlp": 0.12329102, "routerloss_mlp": 0.0, "step": 2230, "time_per_iteration": 2.6279454231262207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088133, "balance_loss_mlp": 1.0761342, "diversity_loss_mlp": 0.0, "epoch": 0.42920353982300885, "flos": 561454115328.0, "grad_norm": 0.13300209785278838, "language_loss": 0.79446864, "learning_rate": 0.000637166364081459, "loss": 0.80534995, "num_input_tokens_seen": 185393408, "router_z_loss_mlp": 0.11993408, "routerloss_mlp": 0.0, "step": 2231, "time_per_iteration": 2.7252066135406494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108179, "balance_loss_mlp": 1.07001245, "diversity_loss_mlp": 0.0, "epoch": 0.42939592150827244, "flos": 556248093696.0, "grad_norm": 0.08046243261781533, "language_loss": 0.84081841, "learning_rate": 0.0006368667475369256, "loss": 0.85163629, "num_input_tokens_seen": 185467968, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2232, "time_per_iteration": 2.756286382675171 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046222, "balance_loss_mlp": 1.03840148, "diversity_loss_mlp": 0.0, "epoch": 0.42958830319353597, "flos": 1521623688192.0, "grad_norm": 0.02809293853716727, "language_loss": 0.78527778, "learning_rate": 0.0006365670778560084, "loss": 0.79574001, "num_input_tokens_seen": 185705232, "router_z_loss_mlp": 0.078125, "routerloss_mlp": 0.0, "step": 2233, "time_per_iteration": 4.852276086807251 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01030619, "balance_loss_mlp": 1.02313304, "diversity_loss_mlp": 0.0, "epoch": 0.42978068487879956, "flos": 1495813837824.0, "grad_norm": 0.02329901381823612, "language_loss": 0.78895426, "learning_rate": 0.0006362673551550494, "loss": 0.79926044, "num_input_tokens_seen": 185932672, "router_z_loss_mlp": 0.07470703, "routerloss_mlp": 0.0, "step": 2234, "time_per_iteration": 4.812516689300537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107186, "balance_loss_mlp": 1.09534228, "diversity_loss_mlp": 0.0, "epoch": 0.4299730665640631, "flos": 546992372736.0, "grad_norm": 0.06628794940731256, "language_loss": 0.86166692, "learning_rate": 0.0006359675795504112, "loss": 0.87273884, "num_input_tokens_seen": 186006288, "router_z_loss_mlp": 0.1184082, "routerloss_mlp": 0.0, "step": 2235, "time_per_iteration": 2.7691314220428467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112399, "balance_loss_mlp": 1.11230159, "diversity_loss_mlp": 0.0, "epoch": 0.4301654482493267, "flos": 1129293342720.0, "grad_norm": 0.08124483128316094, "language_loss": 0.74637383, "learning_rate": 0.0006356677511584775, "loss": 0.75761378, "num_input_tokens_seen": 186097168, "router_z_loss_mlp": 0.11676025, "routerloss_mlp": 0.0, "step": 2236, "time_per_iteration": 3.51676082611084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138804, "balance_loss_mlp": 1.12733603, "diversity_loss_mlp": 0.0, "epoch": 0.4303578299345902, "flos": 495750025728.0, "grad_norm": 0.08045247853644188, "language_loss": 0.85975677, "learning_rate": 0.0006353678700956511, "loss": 0.87114477, "num_input_tokens_seen": 186163904, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2237, "time_per_iteration": 2.5487072467803955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01137661, "balance_loss_mlp": 1.12605572, "diversity_loss_mlp": 0.0, "epoch": 0.4305502116198538, "flos": 615762736128.0, "grad_norm": 0.08414636037035166, "language_loss": 0.84184766, "learning_rate": 0.0006350679364783569, "loss": 0.85322422, "num_input_tokens_seen": 186233888, "router_z_loss_mlp": 0.1159668, "routerloss_mlp": 0.0, "step": 2238, "time_per_iteration": 2.730128288269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113973, "balance_loss_mlp": 1.1279577, "diversity_loss_mlp": 0.0, "epoch": 0.4307425933051173, "flos": 559260039168.0, "grad_norm": 0.06707032645836293, "language_loss": 0.85872072, "learning_rate": 0.0006347679504230393, "loss": 0.87011802, "num_input_tokens_seen": 186301168, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2239, "time_per_iteration": 2.640791893005371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01136631, "balance_loss_mlp": 1.12453079, "diversity_loss_mlp": 0.0, "epoch": 0.4309349749903809, "flos": 972166344192.0, "grad_norm": 0.07174503893432663, "language_loss": 0.7626543, "learning_rate": 0.0006344679120461632, "loss": 0.77402061, "num_input_tokens_seen": 186392096, "router_z_loss_mlp": 0.12097168, "routerloss_mlp": 0.0, "step": 2240, "time_per_iteration": 3.3352768421173096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01128316, "balance_loss_mlp": 1.11687779, "diversity_loss_mlp": 0.0, "epoch": 0.4311273566756445, "flos": 541924743168.0, "grad_norm": 0.08647233478950261, "language_loss": 0.79984182, "learning_rate": 0.0006341678214642134, "loss": 0.81112498, "num_input_tokens_seen": 186458000, "router_z_loss_mlp": 0.11431885, "routerloss_mlp": 0.0, "step": 2241, "time_per_iteration": 2.662132740020752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114748, "balance_loss_mlp": 1.10336995, "diversity_loss_mlp": 0.0, "epoch": 0.43131973836090803, "flos": 761674503168.0, "grad_norm": 0.06482352137494116, "language_loss": 0.82986903, "learning_rate": 0.0006338676787936963, "loss": 0.84101653, "num_input_tokens_seen": 186544992, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2242, "time_per_iteration": 3.064518451690674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01123318, "balance_loss_mlp": 1.11183178, "diversity_loss_mlp": 0.0, "epoch": 0.4315121200461716, "flos": 554530862592.0, "grad_norm": 0.07554467546841755, "language_loss": 0.84015846, "learning_rate": 0.0006335674841511367, "loss": 0.85139167, "num_input_tokens_seen": 186614960, "router_z_loss_mlp": 0.11480713, "routerloss_mlp": 0.0, "step": 2243, "time_per_iteration": 2.7494354248046875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067953, "balance_loss_mlp": 1.06189752, "diversity_loss_mlp": 0.0, "epoch": 0.43170450173143515, "flos": 1485334609920.0, "grad_norm": 0.020266409588932003, "language_loss": 0.7918117, "learning_rate": 0.000633267237653081, "loss": 0.80249119, "num_input_tokens_seen": 186854288, "router_z_loss_mlp": 0.06054688, "routerloss_mlp": 0.0, "step": 2244, "time_per_iteration": 5.019898414611816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058136, "balance_loss_mlp": 1.05208015, "diversity_loss_mlp": 0.0, "epoch": 0.43189688341669874, "flos": 1473697234944.0, "grad_norm": 0.017496917907237546, "language_loss": 0.77365553, "learning_rate": 0.0006329669394160953, "loss": 0.78423691, "num_input_tokens_seen": 187090272, "router_z_loss_mlp": 0.06054688, "routerloss_mlp": 0.0, "step": 2245, "time_per_iteration": 4.940483808517456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111091, "balance_loss_mlp": 1.09893775, "diversity_loss_mlp": 0.0, "epoch": 0.43208926510196227, "flos": 492938141184.0, "grad_norm": 0.07826437205196314, "language_loss": 0.82487583, "learning_rate": 0.0006326665895567652, "loss": 0.83598673, "num_input_tokens_seen": 187157584, "router_z_loss_mlp": 0.121521, "routerloss_mlp": 0.0, "step": 2246, "time_per_iteration": 2.6287152767181396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111115, "balance_loss_mlp": 1.09895015, "diversity_loss_mlp": 0.0, "epoch": 0.43228164678722586, "flos": 520235799552.0, "grad_norm": 0.09268036537549412, "language_loss": 0.87613881, "learning_rate": 0.0006323661881916976, "loss": 0.88725001, "num_input_tokens_seen": 187229408, "router_z_loss_mlp": 0.121521, "routerloss_mlp": 0.0, "step": 2247, "time_per_iteration": 2.6966464519500732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110117, "balance_loss_mlp": 1.08901072, "diversity_loss_mlp": 0.0, "epoch": 0.4324740284724894, "flos": 796056201216.0, "grad_norm": 0.07850654458656253, "language_loss": 0.812437, "learning_rate": 0.0006320657354375179, "loss": 0.82344878, "num_input_tokens_seen": 187304384, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 2248, "time_per_iteration": 3.0057384967803955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100829, "balance_loss_mlp": 1.08872366, "diversity_loss_mlp": 0.0, "epoch": 0.432666410157753, "flos": 482153140224.0, "grad_norm": 0.07399569527983862, "language_loss": 0.87203169, "learning_rate": 0.0006317652314108726, "loss": 0.88303995, "num_input_tokens_seen": 187368064, "router_z_loss_mlp": 0.12097168, "routerloss_mlp": 0.0, "step": 2249, "time_per_iteration": 2.6106557846069336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083093, "balance_loss_mlp": 1.07126176, "diversity_loss_mlp": 0.0, "epoch": 0.43285879184301657, "flos": 500212329984.0, "grad_norm": 0.07131076511794647, "language_loss": 0.91191232, "learning_rate": 0.0006314646762284277, "loss": 0.92274326, "num_input_tokens_seen": 187436320, "router_z_loss_mlp": 0.11816406, "routerloss_mlp": 0.0, "step": 2250, "time_per_iteration": 2.601017951965332 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01032846, "balance_loss_mlp": 1.02617049, "diversity_loss_mlp": 0.0, "epoch": 0.4330511735282801, "flos": 1510448103936.0, "grad_norm": 0.02997957544407836, "language_loss": 0.75425828, "learning_rate": 0.0006311640700068691, "loss": 0.76458681, "num_input_tokens_seen": 187670912, "router_z_loss_mlp": 0.06689453, "routerloss_mlp": 0.0, "step": 2251, "time_per_iteration": 4.872025966644287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085431, "balance_loss_mlp": 1.07351613, "diversity_loss_mlp": 0.0, "epoch": 0.4332435552135437, "flos": 699582915072.0, "grad_norm": 0.07162967916255573, "language_loss": 0.77412337, "learning_rate": 0.0006308634128629022, "loss": 0.78497767, "num_input_tokens_seen": 187746432, "router_z_loss_mlp": 0.11907959, "routerloss_mlp": 0.0, "step": 2252, "time_per_iteration": 2.858896255493164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089815, "balance_loss_mlp": 1.07750654, "diversity_loss_mlp": 0.0, "epoch": 0.4334359368988072, "flos": 592292321280.0, "grad_norm": 0.0655401202696214, "language_loss": 0.8742274, "learning_rate": 0.0006305627049132531, "loss": 0.88512552, "num_input_tokens_seen": 187820032, "router_z_loss_mlp": 0.12298584, "routerloss_mlp": 0.0, "step": 2253, "time_per_iteration": 2.8089702129364014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108552, "balance_loss_mlp": 1.07309866, "diversity_loss_mlp": 0.0, "epoch": 0.4336283185840708, "flos": 842806508544.0, "grad_norm": 0.05577202062379855, "language_loss": 0.85968709, "learning_rate": 0.0006302619462746662, "loss": 0.87054229, "num_input_tokens_seen": 187904400, "router_z_loss_mlp": 0.12414551, "routerloss_mlp": 0.0, "step": 2254, "time_per_iteration": 3.117469072341919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090126, "balance_loss_mlp": 1.07842588, "diversity_loss_mlp": 0.0, "epoch": 0.43382070026933434, "flos": 626258843136.0, "grad_norm": 0.07095559842956704, "language_loss": 0.90230805, "learning_rate": 0.0006299611370639069, "loss": 0.91320932, "num_input_tokens_seen": 187973264, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2255, "time_per_iteration": 2.723188638687134 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084284, "balance_loss_mlp": 1.07239318, "diversity_loss_mlp": 0.0, "epoch": 0.4340130819545979, "flos": 591111406080.0, "grad_norm": 0.07367301477096526, "language_loss": 0.79524988, "learning_rate": 0.0006296602773977593, "loss": 0.80609274, "num_input_tokens_seen": 188039984, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 2256, "time_per_iteration": 2.6743130683898926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099708, "balance_loss_mlp": 1.08790588, "diversity_loss_mlp": 0.0, "epoch": 0.4342054636398615, "flos": 490889797632.0, "grad_norm": 0.06301035546935001, "language_loss": 0.87406039, "learning_rate": 0.0006293593673930277, "loss": 0.88505745, "num_input_tokens_seen": 188113456, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2257, "time_per_iteration": 2.6397616863250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103769, "balance_loss_mlp": 1.09211683, "diversity_loss_mlp": 0.0, "epoch": 0.43439784532512504, "flos": 698994842112.0, "grad_norm": 0.07716264473653381, "language_loss": 0.78774142, "learning_rate": 0.0006290584071665358, "loss": 0.79877913, "num_input_tokens_seen": 188192480, "router_z_loss_mlp": 0.11639404, "routerloss_mlp": 0.0, "step": 2258, "time_per_iteration": 2.9148640632629395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088266, "balance_loss_mlp": 1.07634544, "diversity_loss_mlp": 0.0, "epoch": 0.43459022701038863, "flos": 485824739328.0, "grad_norm": 0.06859255861010008, "language_loss": 0.82309216, "learning_rate": 0.0006287573968351266, "loss": 0.83397484, "num_input_tokens_seen": 188258784, "router_z_loss_mlp": 0.11914062, "routerloss_mlp": 0.0, "step": 2259, "time_per_iteration": 2.582099437713623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081836, "balance_loss_mlp": 1.06989694, "diversity_loss_mlp": 0.0, "epoch": 0.43478260869565216, "flos": 643107382272.0, "grad_norm": 0.0728512329620832, "language_loss": 0.8210361, "learning_rate": 0.0006284563365156626, "loss": 0.83185446, "num_input_tokens_seen": 188331312, "router_z_loss_mlp": 0.11938477, "routerloss_mlp": 0.0, "step": 2260, "time_per_iteration": 2.802004814147949 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075245, "balance_loss_mlp": 1.06343079, "diversity_loss_mlp": 0.0, "epoch": 0.43497499038091575, "flos": 426097552896.0, "grad_norm": 0.08318375282180102, "language_loss": 0.87862843, "learning_rate": 0.0006281552263250261, "loss": 0.88938093, "num_input_tokens_seen": 188393712, "router_z_loss_mlp": 0.11810303, "routerloss_mlp": 0.0, "step": 2261, "time_per_iteration": 2.5335495471954346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0103451, "balance_loss_mlp": 1.02721453, "diversity_loss_mlp": 0.0, "epoch": 0.4351673720661793, "flos": 1538378625024.0, "grad_norm": 0.02511862566194507, "language_loss": 0.80691534, "learning_rate": 0.000627854066380118, "loss": 0.81726044, "num_input_tokens_seen": 188621152, "router_z_loss_mlp": 0.07275391, "routerloss_mlp": 0.0, "step": 2262, "time_per_iteration": 4.858395338058472 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067582, "balance_loss_mlp": 1.05593562, "diversity_loss_mlp": 0.0, "epoch": 0.43535975375144287, "flos": 749155018752.0, "grad_norm": 0.07030760098393707, "language_loss": 0.81181604, "learning_rate": 0.0006275528567978593, "loss": 0.82249182, "num_input_tokens_seen": 188697120, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2263, "time_per_iteration": 2.9562113285064697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106752, "balance_loss_mlp": 1.05570674, "diversity_loss_mlp": 0.0, "epoch": 0.4355521354367064, "flos": 861280874496.0, "grad_norm": 0.09515047383985015, "language_loss": 0.82464182, "learning_rate": 0.0006272515976951898, "loss": 0.83531702, "num_input_tokens_seen": 188778480, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2264, "time_per_iteration": 3.0750486850738525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106778, "balance_loss_mlp": 1.05625236, "diversity_loss_mlp": 0.0, "epoch": 0.43574451712197, "flos": 734527719936.0, "grad_norm": 0.06538835415995116, "language_loss": 0.7903443, "learning_rate": 0.0006269502891890687, "loss": 0.80102211, "num_input_tokens_seen": 188863616, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2265, "time_per_iteration": 3.0723042488098145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069278, "balance_loss_mlp": 1.05721438, "diversity_loss_mlp": 0.0, "epoch": 0.4359368988072336, "flos": 570578784768.0, "grad_norm": 0.06791130510000161, "language_loss": 0.88071477, "learning_rate": 0.0006266489313964743, "loss": 0.89140749, "num_input_tokens_seen": 188933984, "router_z_loss_mlp": 0.12060547, "routerloss_mlp": 0.0, "step": 2266, "time_per_iteration": 2.7362618446350098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00937641, "balance_loss_mlp": 1.63294578, "diversity_loss_mlp": 0.21328503, "epoch": 0.4361292804924971, "flos": 555528969216.0, "grad_norm": 0.028233172977391998, "language_loss": 0.85207379, "learning_rate": 0.0006263475244344041, "loss": 0.8614502, "num_input_tokens_seen": 189012976, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01452552, "step": 2267, "time_per_iteration": 2.8842954635620117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082065, "balance_loss_mlp": 1.06979251, "diversity_loss_mlp": 0.0, "epoch": 0.4363216621777607, "flos": 557285847552.0, "grad_norm": 0.07502115173737808, "language_loss": 0.84271002, "learning_rate": 0.0006260460684198746, "loss": 0.8535307, "num_input_tokens_seen": 189079664, "router_z_loss_mlp": 0.12268066, "routerloss_mlp": 0.0, "step": 2268, "time_per_iteration": 2.6355533599853516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089831, "balance_loss_mlp": 1.07749879, "diversity_loss_mlp": 0.0, "epoch": 0.4365140438630242, "flos": 478222009344.0, "grad_norm": 0.07640014386484298, "language_loss": 0.84040511, "learning_rate": 0.0006257445634699213, "loss": 0.85130346, "num_input_tokens_seen": 189144688, "router_z_loss_mlp": 0.12322998, "routerloss_mlp": 0.0, "step": 2269, "time_per_iteration": 2.5279150009155273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089112, "balance_loss_mlp": 1.07683921, "diversity_loss_mlp": 0.0, "epoch": 0.4367064255482878, "flos": 578917891584.0, "grad_norm": 0.16142331523875347, "language_loss": 0.83037758, "learning_rate": 0.0006254430097015993, "loss": 0.84126872, "num_input_tokens_seen": 189213984, "router_z_loss_mlp": 0.12268066, "routerloss_mlp": 0.0, "step": 2270, "time_per_iteration": 2.660228729248047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01037647, "balance_loss_mlp": 1.03087568, "diversity_loss_mlp": 0.0, "epoch": 0.43689880723355135, "flos": 1458946225152.0, "grad_norm": 0.024589935077845904, "language_loss": 0.76479089, "learning_rate": 0.0006251414072319815, "loss": 0.77516735, "num_input_tokens_seen": 189434416, "router_z_loss_mlp": 0.06787109, "routerloss_mlp": 0.0, "step": 2271, "time_per_iteration": 4.794579744338989 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070138, "balance_loss_mlp": 1.05796623, "diversity_loss_mlp": 0.0, "epoch": 0.43709118891881493, "flos": 667610408448.0, "grad_norm": 0.057648382072647573, "language_loss": 0.85053569, "learning_rate": 0.0006248397561781609, "loss": 0.86123705, "num_input_tokens_seen": 189513248, "router_z_loss_mlp": 0.12164307, "routerloss_mlp": 0.0, "step": 2272, "time_per_iteration": 2.862569570541382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067836, "balance_loss_mlp": 1.05557537, "diversity_loss_mlp": 0.0, "epoch": 0.43728357060407846, "flos": 544872448512.0, "grad_norm": 0.08840424380788836, "language_loss": 0.86255217, "learning_rate": 0.0006245380566572482, "loss": 0.87323052, "num_input_tokens_seen": 189585392, "router_z_loss_mlp": 0.12255859, "routerloss_mlp": 0.0, "step": 2273, "time_per_iteration": 2.7386484146118164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068942, "balance_loss_mlp": 1.0566572, "diversity_loss_mlp": 0.0, "epoch": 0.43747595228934205, "flos": 746839802880.0, "grad_norm": 0.07723857249852564, "language_loss": 0.75794655, "learning_rate": 0.0006242363087863744, "loss": 0.76863599, "num_input_tokens_seen": 189667552, "router_z_loss_mlp": 0.12286377, "routerloss_mlp": 0.0, "step": 2274, "time_per_iteration": 2.948030710220337 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010675, "balance_loss_mlp": 1.05560887, "diversity_loss_mlp": 0.0, "epoch": 0.43766833397460564, "flos": 631353636864.0, "grad_norm": 0.06687985923679116, "language_loss": 0.86043644, "learning_rate": 0.0006239345126826878, "loss": 0.87111151, "num_input_tokens_seen": 189742048, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 2275, "time_per_iteration": 2.787750482559204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071289, "balance_loss_mlp": 1.05926108, "diversity_loss_mlp": 0.0, "epoch": 0.43786071565986917, "flos": 530986295808.0, "grad_norm": 0.07503499995760528, "language_loss": 0.83946115, "learning_rate": 0.0006236326684633561, "loss": 0.85017407, "num_input_tokens_seen": 189817968, "router_z_loss_mlp": 0.12017822, "routerloss_mlp": 0.0, "step": 2276, "time_per_iteration": 2.8109841346740723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071151, "balance_loss_mlp": 1.05921769, "diversity_loss_mlp": 0.0, "epoch": 0.43805309734513276, "flos": 538547180544.0, "grad_norm": 0.08049471875944368, "language_loss": 0.75253642, "learning_rate": 0.0006233307762455658, "loss": 0.76324785, "num_input_tokens_seen": 189882608, "router_z_loss_mlp": 0.11932373, "routerloss_mlp": 0.0, "step": 2277, "time_per_iteration": 2.632291793823242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072177, "balance_loss_mlp": 1.06043518, "diversity_loss_mlp": 0.0, "epoch": 0.4382454790303963, "flos": 864542439936.0, "grad_norm": 0.0727539933311737, "language_loss": 0.83312476, "learning_rate": 0.0006230288361465216, "loss": 0.8438465, "num_input_tokens_seen": 189960608, "router_z_loss_mlp": 0.11730957, "routerloss_mlp": 0.0, "step": 2278, "time_per_iteration": 3.060615062713623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106943, "balance_loss_mlp": 1.05752659, "diversity_loss_mlp": 0.0, "epoch": 0.4384378607156599, "flos": 765499548672.0, "grad_norm": 0.08745359184854619, "language_loss": 0.84888816, "learning_rate": 0.0006227268482834473, "loss": 0.85958248, "num_input_tokens_seen": 190035472, "router_z_loss_mlp": 0.11889648, "routerloss_mlp": 0.0, "step": 2279, "time_per_iteration": 2.9116861820220947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00929134, "balance_loss_mlp": 1.61467147, "diversity_loss_mlp": 0.21327347, "epoch": 0.4386302424009234, "flos": 668566669824.0, "grad_norm": 0.03053717197724305, "language_loss": 0.8733198, "learning_rate": 0.000622424812773585, "loss": 0.88261116, "num_input_tokens_seen": 190109312, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0151619, "step": 2280, "time_per_iteration": 2.83655047416687 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087033, "balance_loss_mlp": 1.07515955, "diversity_loss_mlp": 0.0, "epoch": 0.438822624086187, "flos": 485182338048.0, "grad_norm": 0.09030781332224262, "language_loss": 0.8003484, "learning_rate": 0.000622122729734195, "loss": 0.81121874, "num_input_tokens_seen": 190174176, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2281, "time_per_iteration": 2.598515033721924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088881, "balance_loss_mlp": 1.07746708, "diversity_loss_mlp": 0.0, "epoch": 0.4390150057714506, "flos": 499218992640.0, "grad_norm": 0.05965815533468205, "language_loss": 0.87430406, "learning_rate": 0.0006218205992825566, "loss": 0.88519287, "num_input_tokens_seen": 190243888, "router_z_loss_mlp": 0.11413574, "routerloss_mlp": 0.0, "step": 2282, "time_per_iteration": 2.6424663066864014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084394, "balance_loss_mlp": 1.07271123, "diversity_loss_mlp": 0.0, "epoch": 0.4392073874567141, "flos": 558219714048.0, "grad_norm": 0.06483845116972914, "language_loss": 0.81733787, "learning_rate": 0.0006215184215359671, "loss": 0.8281818, "num_input_tokens_seen": 190317504, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2283, "time_per_iteration": 2.736311674118042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087917, "balance_loss_mlp": 1.07662153, "diversity_loss_mlp": 0.0, "epoch": 0.4393997691419777, "flos": 605306276352.0, "grad_norm": 0.0656289826640407, "language_loss": 0.86697561, "learning_rate": 0.0006212161966117425, "loss": 0.8778547, "num_input_tokens_seen": 190390160, "router_z_loss_mlp": 0.11297607, "routerloss_mlp": 0.0, "step": 2284, "time_per_iteration": 2.727402448654175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091514, "balance_loss_mlp": 1.07989156, "diversity_loss_mlp": 0.0, "epoch": 0.43959215082724123, "flos": 804145688064.0, "grad_norm": 0.07463232969806483, "language_loss": 0.81628394, "learning_rate": 0.0006209139246272164, "loss": 0.8271991, "num_input_tokens_seen": 190467600, "router_z_loss_mlp": 0.11621094, "routerloss_mlp": 0.0, "step": 2285, "time_per_iteration": 2.978759527206421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093878, "balance_loss_mlp": 1.08205843, "diversity_loss_mlp": 0.0, "epoch": 0.4397845325125048, "flos": 487643286528.0, "grad_norm": 0.08236326374350296, "language_loss": 0.81938732, "learning_rate": 0.0006206116056997421, "loss": 0.83032608, "num_input_tokens_seen": 190534192, "router_z_loss_mlp": 0.1182251, "routerloss_mlp": 0.0, "step": 2286, "time_per_iteration": 2.6111207008361816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085811, "balance_loss_mlp": 1.07444477, "diversity_loss_mlp": 0.0, "epoch": 0.43997691419776835, "flos": 480811438080.0, "grad_norm": 0.06662472973472185, "language_loss": 0.82727671, "learning_rate": 0.0006203092399466892, "loss": 0.83813483, "num_input_tokens_seen": 190601440, "router_z_loss_mlp": 0.1137085, "routerloss_mlp": 0.0, "step": 2287, "time_per_iteration": 2.6246864795684814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109279, "balance_loss_mlp": 1.08137023, "diversity_loss_mlp": 0.0, "epoch": 0.44016929588303194, "flos": 483124082688.0, "grad_norm": 0.06470350083987941, "language_loss": 0.85380936, "learning_rate": 0.0006200068274854473, "loss": 0.86473733, "num_input_tokens_seen": 190672528, "router_z_loss_mlp": 0.11419678, "routerloss_mlp": 0.0, "step": 2288, "time_per_iteration": 2.675197124481201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091416, "balance_loss_mlp": 1.07988858, "diversity_loss_mlp": 0.0, "epoch": 0.4403616775682955, "flos": 571853675520.0, "grad_norm": 0.0650031810595099, "language_loss": 0.8588661, "learning_rate": 0.0006197043684334229, "loss": 0.86978024, "num_input_tokens_seen": 190750704, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2289, "time_per_iteration": 2.787095785140991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092106, "balance_loss_mlp": 1.08063841, "diversity_loss_mlp": 0.0, "epoch": 0.44055405925355906, "flos": 630849627648.0, "grad_norm": 0.0715970788084748, "language_loss": 0.79333103, "learning_rate": 0.0006194018629080411, "loss": 0.80425215, "num_input_tokens_seen": 190821664, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2290, "time_per_iteration": 2.817836284637451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103028, "balance_loss_mlp": 1.09150028, "diversity_loss_mlp": 0.0, "epoch": 0.44074644093882265, "flos": 536782961664.0, "grad_norm": 0.07061114258803743, "language_loss": 0.81714827, "learning_rate": 0.0006190993110267451, "loss": 0.82817852, "num_input_tokens_seen": 190893888, "router_z_loss_mlp": 0.11523438, "routerloss_mlp": 0.0, "step": 2291, "time_per_iteration": 2.741288900375366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108227, "balance_loss_mlp": 1.09614503, "diversity_loss_mlp": 0.0, "epoch": 0.4409388226240862, "flos": 463229093376.0, "grad_norm": 0.07455801894128893, "language_loss": 0.84193838, "learning_rate": 0.0006187967129069958, "loss": 0.85302061, "num_input_tokens_seen": 190956800, "router_z_loss_mlp": 0.12084961, "routerloss_mlp": 0.0, "step": 2292, "time_per_iteration": 2.5778286457061768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106682, "balance_loss_mlp": 1.09472573, "diversity_loss_mlp": 0.0, "epoch": 0.44113120430934977, "flos": 566005252608.0, "grad_norm": 0.06400814904414545, "language_loss": 0.8690064, "learning_rate": 0.0006184940686662722, "loss": 0.88007319, "num_input_tokens_seen": 191032048, "router_z_loss_mlp": 0.11950684, "routerloss_mlp": 0.0, "step": 2293, "time_per_iteration": 2.7292487621307373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111022, "balance_loss_mlp": 1.09812045, "diversity_loss_mlp": 0.0, "epoch": 0.4413235859946133, "flos": 543585074688.0, "grad_norm": 0.06813451942076464, "language_loss": 0.90379488, "learning_rate": 0.0006181913784220714, "loss": 0.91489702, "num_input_tokens_seen": 191099952, "router_z_loss_mlp": 0.12091064, "routerloss_mlp": 0.0, "step": 2294, "time_per_iteration": 2.6506428718566895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081962, "balance_loss_mlp": 1.0750953, "diversity_loss_mlp": 0.0, "epoch": 0.4415159676798769, "flos": 1569871342080.0, "grad_norm": 0.029819366941177792, "language_loss": 0.80553782, "learning_rate": 0.0006178886422919078, "loss": 0.81635749, "num_input_tokens_seen": 191335968, "router_z_loss_mlp": 0.06884766, "routerloss_mlp": 0.0, "step": 2295, "time_per_iteration": 4.882002592086792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110182, "balance_loss_mlp": 1.09772444, "diversity_loss_mlp": 0.0, "epoch": 0.4417083493651404, "flos": 658740128256.0, "grad_norm": 0.07012194180041048, "language_loss": 0.7971437, "learning_rate": 0.0006175858603933146, "loss": 0.80824548, "num_input_tokens_seen": 191410112, "router_z_loss_mlp": 0.12469482, "routerloss_mlp": 0.0, "step": 2296, "time_per_iteration": 2.8836371898651123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00908854, "balance_loss_mlp": 1.58032632, "diversity_loss_mlp": 0.2095283, "epoch": 0.441900731050404, "flos": 740457635328.0, "grad_norm": 0.03267646081870075, "language_loss": 0.80986243, "learning_rate": 0.0006172830328438416, "loss": 0.81895095, "num_input_tokens_seen": 191491552, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01392685, "step": 2297, "time_per_iteration": 2.9758472442626953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093338, "balance_loss_mlp": 1.0806725, "diversity_loss_mlp": 0.0, "epoch": 0.44209311273566754, "flos": 539441399808.0, "grad_norm": 0.0684627092891604, "language_loss": 0.86739677, "learning_rate": 0.0006169801597610572, "loss": 0.87833017, "num_input_tokens_seen": 191567872, "router_z_loss_mlp": 0.12670898, "routerloss_mlp": 0.0, "step": 2298, "time_per_iteration": 2.796999454498291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080861, "balance_loss_mlp": 1.06855834, "diversity_loss_mlp": 0.0, "epoch": 0.4422854944209311, "flos": 621613730304.0, "grad_norm": 0.09148837874044675, "language_loss": 0.89672303, "learning_rate": 0.0006166772412625469, "loss": 0.90753162, "num_input_tokens_seen": 191638032, "router_z_loss_mlp": 0.12304688, "routerloss_mlp": 0.0, "step": 2299, "time_per_iteration": 2.719217300415039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079493, "balance_loss_mlp": 1.06674969, "diversity_loss_mlp": 0.0, "epoch": 0.4424778761061947, "flos": 658824192000.0, "grad_norm": 0.0806717243265584, "language_loss": 0.81995088, "learning_rate": 0.0006163742774659141, "loss": 0.83074582, "num_input_tokens_seen": 191709104, "router_z_loss_mlp": 0.12744141, "routerloss_mlp": 0.0, "step": 2300, "time_per_iteration": 2.857851266860962 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082582, "balance_loss_mlp": 1.07051837, "diversity_loss_mlp": 0.0, "epoch": 0.44267025779145824, "flos": 568577428992.0, "grad_norm": 0.07368324051857801, "language_loss": 0.85920924, "learning_rate": 0.0006160712684887801, "loss": 0.87003505, "num_input_tokens_seen": 191787072, "router_z_loss_mlp": 0.1206665, "routerloss_mlp": 0.0, "step": 2301, "time_per_iteration": 2.7615816593170166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076648, "balance_loss_mlp": 1.06491232, "diversity_loss_mlp": 0.0, "epoch": 0.44286263947672183, "flos": 496738220544.0, "grad_norm": 0.07775198871362894, "language_loss": 0.81987381, "learning_rate": 0.0006157682144487832, "loss": 0.83064032, "num_input_tokens_seen": 191863040, "router_z_loss_mlp": 0.11730957, "routerloss_mlp": 0.0, "step": 2302, "time_per_iteration": 2.759446620941162 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071769, "balance_loss_mlp": 1.05998516, "diversity_loss_mlp": 0.0, "epoch": 0.44305502116198536, "flos": 609397820928.0, "grad_norm": 0.07391427816126875, "language_loss": 0.82887244, "learning_rate": 0.0006154651154635793, "loss": 0.83959019, "num_input_tokens_seen": 191940352, "router_z_loss_mlp": 0.11779785, "routerloss_mlp": 0.0, "step": 2303, "time_per_iteration": 2.8566582202911377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074582, "balance_loss_mlp": 1.0627867, "diversity_loss_mlp": 0.0, "epoch": 0.44324740284724895, "flos": 470794747392.0, "grad_norm": 0.07276664214775759, "language_loss": 0.84800553, "learning_rate": 0.0006151619716508421, "loss": 0.85875136, "num_input_tokens_seen": 192006896, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2304, "time_per_iteration": 2.678624153137207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070842, "balance_loss_mlp": 1.05890322, "diversity_loss_mlp": 0.0, "epoch": 0.4434397845325125, "flos": 578725171200.0, "grad_norm": 0.0708190445963316, "language_loss": 0.87117589, "learning_rate": 0.0006148587831282625, "loss": 0.88188434, "num_input_tokens_seen": 192075312, "router_z_loss_mlp": 0.11920166, "routerloss_mlp": 0.0, "step": 2305, "time_per_iteration": 2.6833643913269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065001, "balance_loss_mlp": 1.05813479, "diversity_loss_mlp": 0.0, "epoch": 0.44363216621777607, "flos": 1496608939008.0, "grad_norm": 0.03167846404368131, "language_loss": 0.79176068, "learning_rate": 0.0006145555500135483, "loss": 0.80241072, "num_input_tokens_seen": 192304816, "router_z_loss_mlp": 0.06884766, "routerloss_mlp": 0.0, "step": 2306, "time_per_iteration": 4.908214092254639 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074681, "balance_loss_mlp": 1.06202734, "diversity_loss_mlp": 0.0, "epoch": 0.44382454790303966, "flos": 477322647552.0, "grad_norm": 0.10781991147306623, "language_loss": 0.87386847, "learning_rate": 0.0006142522724244255, "loss": 0.8846153, "num_input_tokens_seen": 192369232, "router_z_loss_mlp": 0.12664795, "routerloss_mlp": 0.0, "step": 2307, "time_per_iteration": 2.559011459350586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01039977, "balance_loss_mlp": 1.03301477, "diversity_loss_mlp": 0.0, "epoch": 0.4440169295883032, "flos": 1544115820032.0, "grad_norm": 0.019467834986953515, "language_loss": 0.76484716, "learning_rate": 0.0006139489504786368, "loss": 0.77524698, "num_input_tokens_seen": 192600176, "router_z_loss_mlp": 0.06982422, "routerloss_mlp": 0.0, "step": 2308, "time_per_iteration": 4.990226984024048 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010956, "balance_loss_mlp": 1.08379281, "diversity_loss_mlp": 0.0, "epoch": 0.4442093112735668, "flos": 591089011200.0, "grad_norm": 0.134173965781989, "language_loss": 0.77330542, "learning_rate": 0.000613645584293942, "loss": 0.78426147, "num_input_tokens_seen": 192675424, "router_z_loss_mlp": 0.11798096, "routerloss_mlp": 0.0, "step": 2309, "time_per_iteration": 2.925625801086426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096392, "balance_loss_mlp": 1.08444726, "diversity_loss_mlp": 0.0, "epoch": 0.4444016929588303, "flos": 530272313856.0, "grad_norm": 0.07260585347328512, "language_loss": 0.83497787, "learning_rate": 0.0006133421739881185, "loss": 0.84594172, "num_input_tokens_seen": 192747552, "router_z_loss_mlp": 0.11938477, "routerloss_mlp": 0.0, "step": 2310, "time_per_iteration": 2.6521387100219727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105727, "balance_loss_mlp": 1.09360933, "diversity_loss_mlp": 0.0, "epoch": 0.4445940746440939, "flos": 620234952192.0, "grad_norm": 0.08716252058009813, "language_loss": 0.82747865, "learning_rate": 0.0006130387196789605, "loss": 0.8385359, "num_input_tokens_seen": 192819984, "router_z_loss_mlp": 0.12115479, "routerloss_mlp": 0.0, "step": 2311, "time_per_iteration": 2.7266759872436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100575, "balance_loss_mlp": 1.08809423, "diversity_loss_mlp": 0.0, "epoch": 0.4447864563293574, "flos": 629100089856.0, "grad_norm": 0.057672451626414926, "language_loss": 0.84308195, "learning_rate": 0.0006127352214842795, "loss": 0.85408771, "num_input_tokens_seen": 192906080, "router_z_loss_mlp": 0.12493896, "routerloss_mlp": 0.0, "step": 2312, "time_per_iteration": 2.9728119373321533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104798, "balance_loss_mlp": 1.09263897, "diversity_loss_mlp": 0.0, "epoch": 0.444978838014621, "flos": 650838592512.0, "grad_norm": 0.09124128780751645, "language_loss": 0.85551131, "learning_rate": 0.0006124316795219041, "loss": 0.86655927, "num_input_tokens_seen": 192972336, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 2313, "time_per_iteration": 2.793999671936035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098642, "balance_loss_mlp": 1.08649504, "diversity_loss_mlp": 0.0, "epoch": 0.44517121969988455, "flos": 612439501824.0, "grad_norm": 0.07392199689713573, "language_loss": 0.82170153, "learning_rate": 0.0006121280939096794, "loss": 0.83268797, "num_input_tokens_seen": 193045744, "router_z_loss_mlp": 0.12145996, "routerloss_mlp": 0.0, "step": 2314, "time_per_iteration": 2.7882213592529297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087686, "balance_loss_mlp": 1.07496047, "diversity_loss_mlp": 0.0, "epoch": 0.44536360138514813, "flos": 488735368704.0, "grad_norm": 0.07188819518398708, "language_loss": 0.87831259, "learning_rate": 0.000611824464765468, "loss": 0.88918942, "num_input_tokens_seen": 193115248, "router_z_loss_mlp": 0.12738037, "routerloss_mlp": 0.0, "step": 2315, "time_per_iteration": 2.570239305496216 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041791, "balance_loss_mlp": 1.03435254, "diversity_loss_mlp": 0.0, "epoch": 0.4455559830704117, "flos": 1516148969472.0, "grad_norm": 0.031544046963938845, "language_loss": 0.78594941, "learning_rate": 0.0006115207922071492, "loss": 0.79636735, "num_input_tokens_seen": 193330816, "router_z_loss_mlp": 0.07421875, "routerloss_mlp": 0.0, "step": 2316, "time_per_iteration": 4.63933539390564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107211, "balance_loss_mlp": 1.05995071, "diversity_loss_mlp": 0.0, "epoch": 0.44574836475567525, "flos": 615614432256.0, "grad_norm": 0.10006595419905694, "language_loss": 0.85561663, "learning_rate": 0.000611217076352619, "loss": 0.86633772, "num_input_tokens_seen": 193407616, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 2317, "time_per_iteration": 2.763282299041748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068374, "balance_loss_mlp": 1.05613708, "diversity_loss_mlp": 0.0, "epoch": 0.44594074644093884, "flos": 506342306304.0, "grad_norm": 0.07080250397958886, "language_loss": 0.8323034, "learning_rate": 0.0006109133173197905, "loss": 0.84298718, "num_input_tokens_seen": 193482624, "router_z_loss_mlp": 0.12237549, "routerloss_mlp": 0.0, "step": 2318, "time_per_iteration": 2.7228074073791504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067061, "balance_loss_mlp": 1.0546751, "diversity_loss_mlp": 0.0, "epoch": 0.44613312812620237, "flos": 726979318272.0, "grad_norm": 0.07919775459104113, "language_loss": 0.85392821, "learning_rate": 0.0006106095152265935, "loss": 0.86459887, "num_input_tokens_seen": 193555952, "router_z_loss_mlp": 0.12390137, "routerloss_mlp": 0.0, "step": 2319, "time_per_iteration": 2.950333595275879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067661, "balance_loss_mlp": 1.05547166, "diversity_loss_mlp": 0.0, "epoch": 0.44632550981146596, "flos": 635746558464.0, "grad_norm": 0.061336847968553085, "language_loss": 0.84789562, "learning_rate": 0.0006103056701909739, "loss": 0.85857224, "num_input_tokens_seen": 193636672, "router_z_loss_mlp": 0.12176514, "routerloss_mlp": 0.0, "step": 2320, "time_per_iteration": 2.9283788204193115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076472, "balance_loss_mlp": 1.06437278, "diversity_loss_mlp": 0.0, "epoch": 0.4465178914967295, "flos": 827074644480.0, "grad_norm": 0.06696737396207848, "language_loss": 0.83276129, "learning_rate": 0.0006100017823308956, "loss": 0.84352595, "num_input_tokens_seen": 193721728, "router_z_loss_mlp": 0.12078857, "routerloss_mlp": 0.0, "step": 2321, "time_per_iteration": 3.159337282180786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072799, "balance_loss_mlp": 1.06091988, "diversity_loss_mlp": 0.0, "epoch": 0.4467102731819931, "flos": 665831508480.0, "grad_norm": 0.07676377008356373, "language_loss": 0.79803503, "learning_rate": 0.0006096978517643377, "loss": 0.80876303, "num_input_tokens_seen": 193795456, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2322, "time_per_iteration": 2.8253674507141113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00921995, "balance_loss_mlp": 1.60181236, "diversity_loss_mlp": 0.21422489, "epoch": 0.4469026548672566, "flos": 512946929664.0, "grad_norm": 0.03237790796068106, "language_loss": 0.83347481, "learning_rate": 0.0006093938786092968, "loss": 0.84269476, "num_input_tokens_seen": 193865520, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01397606, "step": 2323, "time_per_iteration": 2.648444890975952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110106, "balance_loss_mlp": 1.09840608, "diversity_loss_mlp": 0.0, "epoch": 0.4470950365525202, "flos": 684076078080.0, "grad_norm": 0.07300553293113453, "language_loss": 0.90023661, "learning_rate": 0.0006090898629837857, "loss": 0.91133773, "num_input_tokens_seen": 193935040, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2324, "time_per_iteration": 2.852698564529419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01126468, "balance_loss_mlp": 1.11461282, "diversity_loss_mlp": 0.0, "epoch": 0.4472874182377838, "flos": 627321189888.0, "grad_norm": 0.06000654076761871, "language_loss": 0.87143672, "learning_rate": 0.0006087858050058337, "loss": 0.8827014, "num_input_tokens_seen": 194009120, "router_z_loss_mlp": 0.11846924, "routerloss_mlp": 0.0, "step": 2325, "time_per_iteration": 2.7674834728240967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138358, "balance_loss_mlp": 1.12663388, "diversity_loss_mlp": 0.0, "epoch": 0.4474797999230473, "flos": 547204916736.0, "grad_norm": 0.0853990663964482, "language_loss": 0.82412744, "learning_rate": 0.0006084817047934866, "loss": 0.83551097, "num_input_tokens_seen": 194076672, "router_z_loss_mlp": 0.1171875, "routerloss_mlp": 0.0, "step": 2326, "time_per_iteration": 2.6421871185302734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121575, "balance_loss_mlp": 1.10977352, "diversity_loss_mlp": 0.0, "epoch": 0.4476721816083109, "flos": 455819083776.0, "grad_norm": 0.08985792381424736, "language_loss": 0.89330196, "learning_rate": 0.0006081775624648066, "loss": 0.90451771, "num_input_tokens_seen": 194142320, "router_z_loss_mlp": 0.11791992, "routerloss_mlp": 0.0, "step": 2327, "time_per_iteration": 2.578197956085205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131674, "balance_loss_mlp": 1.12057006, "diversity_loss_mlp": 0.0, "epoch": 0.44786456329357444, "flos": 481518079488.0, "grad_norm": 0.0872530433154025, "language_loss": 0.83162999, "learning_rate": 0.0006078733781378721, "loss": 0.84294665, "num_input_tokens_seen": 194208560, "router_z_loss_mlp": 0.11108398, "routerloss_mlp": 0.0, "step": 2328, "time_per_iteration": 2.6186208724975586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099348, "balance_loss_mlp": 1.08810675, "diversity_loss_mlp": 0.0, "epoch": 0.448056944978838, "flos": 552104418816.0, "grad_norm": 0.07633837573658239, "language_loss": 0.82202363, "learning_rate": 0.0006075691519307781, "loss": 0.83301711, "num_input_tokens_seen": 194288080, "router_z_loss_mlp": 0.11248779, "routerloss_mlp": 0.0, "step": 2329, "time_per_iteration": 2.9000244140625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094166, "balance_loss_mlp": 1.08247721, "diversity_loss_mlp": 0.0, "epoch": 0.44824932666410156, "flos": 550839439872.0, "grad_norm": 0.0736281868256213, "language_loss": 0.81618124, "learning_rate": 0.0006072648839616356, "loss": 0.82712287, "num_input_tokens_seen": 194358464, "router_z_loss_mlp": 0.11694336, "routerloss_mlp": 0.0, "step": 2330, "time_per_iteration": 2.6364829540252686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083826, "balance_loss_mlp": 1.07230425, "diversity_loss_mlp": 0.0, "epoch": 0.44844170834936514, "flos": 988582454784.0, "grad_norm": 0.0657010816534965, "language_loss": 0.82723016, "learning_rate": 0.0006069605743485718, "loss": 0.83806837, "num_input_tokens_seen": 194456112, "router_z_loss_mlp": 0.11517334, "routerloss_mlp": 0.0, "step": 2331, "time_per_iteration": 3.3334474563598633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086805, "balance_loss_mlp": 1.07531917, "diversity_loss_mlp": 0.0, "epoch": 0.44863409003462873, "flos": 591321378816.0, "grad_norm": 0.07225675858451452, "language_loss": 0.83265316, "learning_rate": 0.0006066562232097303, "loss": 0.84352124, "num_input_tokens_seen": 194526880, "router_z_loss_mlp": 0.11480713, "routerloss_mlp": 0.0, "step": 2332, "time_per_iteration": 2.705143690109253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082054, "balance_loss_mlp": 1.07051468, "diversity_loss_mlp": 0.0, "epoch": 0.44882647171989226, "flos": 724646850048.0, "grad_norm": 0.06521315479324259, "language_loss": 0.8614397, "learning_rate": 0.0006063518306632708, "loss": 0.87226027, "num_input_tokens_seen": 194606800, "router_z_loss_mlp": 0.11529541, "routerloss_mlp": 0.0, "step": 2333, "time_per_iteration": 2.9501705169677734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085822, "balance_loss_mlp": 1.07427073, "diversity_loss_mlp": 0.0, "epoch": 0.44901885340515585, "flos": 534927338496.0, "grad_norm": 0.07251688845149425, "language_loss": 0.82197714, "learning_rate": 0.0006060473968273688, "loss": 0.83283544, "num_input_tokens_seen": 194679856, "router_z_loss_mlp": 0.11553955, "routerloss_mlp": 0.0, "step": 2334, "time_per_iteration": 2.708394765853882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01039379, "balance_loss_mlp": 1.032179, "diversity_loss_mlp": 0.0, "epoch": 0.4492112350904194, "flos": 1555300942848.0, "grad_norm": 0.02865006957504222, "language_loss": 0.77879542, "learning_rate": 0.000605742921820216, "loss": 0.78918916, "num_input_tokens_seen": 194906320, "router_z_loss_mlp": 0.07177734, "routerloss_mlp": 0.0, "step": 2335, "time_per_iteration": 4.866912841796875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01026072, "balance_loss_mlp": 1.01901519, "diversity_loss_mlp": 0.0, "epoch": 0.44940361677568297, "flos": 1523358171648.0, "grad_norm": 0.021847156852776353, "language_loss": 0.81005216, "learning_rate": 0.0006054384057600202, "loss": 0.82031286, "num_input_tokens_seen": 195129152, "router_z_loss_mlp": 0.07080078, "routerloss_mlp": 0.0, "step": 2336, "time_per_iteration": 4.834076642990112 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108818, "balance_loss_mlp": 1.07613969, "diversity_loss_mlp": 0.0, "epoch": 0.4495959984609465, "flos": 382495011840.0, "grad_norm": 0.09890748330953583, "language_loss": 0.88285863, "learning_rate": 0.0006051338487650047, "loss": 0.89374042, "num_input_tokens_seen": 195189792, "router_z_loss_mlp": 0.12042236, "routerloss_mlp": 0.0, "step": 2337, "time_per_iteration": 2.4428114891052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00930205, "balance_loss_mlp": 1.62015963, "diversity_loss_mlp": 0.20974493, "epoch": 0.4497883801462101, "flos": 497879861760.0, "grad_norm": 0.03186253719782368, "language_loss": 0.82399797, "learning_rate": 0.0006048292509534095, "loss": 0.83329999, "num_input_tokens_seen": 195258640, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01525321, "step": 2338, "time_per_iteration": 2.6332457065582275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079855, "balance_loss_mlp": 1.06772542, "diversity_loss_mlp": 0.0, "epoch": 0.4499807618314736, "flos": 614450769408.0, "grad_norm": 0.08456945041025239, "language_loss": 0.77873439, "learning_rate": 0.0006045246124434895, "loss": 0.7895329, "num_input_tokens_seen": 195327984, "router_z_loss_mlp": 0.12127686, "routerloss_mlp": 0.0, "step": 2339, "time_per_iteration": 2.7590980529785156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073061, "balance_loss_mlp": 1.06156278, "diversity_loss_mlp": 0.0, "epoch": 0.4501731435167372, "flos": 1005510288384.0, "grad_norm": 0.06841757056071682, "language_loss": 0.86623305, "learning_rate": 0.0006042199333535162, "loss": 0.87696362, "num_input_tokens_seen": 195409504, "router_z_loss_mlp": 0.1149292, "routerloss_mlp": 0.0, "step": 2340, "time_per_iteration": 3.293574333190918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079106, "balance_loss_mlp": 1.06769133, "diversity_loss_mlp": 0.0, "epoch": 0.4503655252020008, "flos": 820880428032.0, "grad_norm": 0.06101547553515947, "language_loss": 0.84343052, "learning_rate": 0.0006039152138017763, "loss": 0.85422158, "num_input_tokens_seen": 195489424, "router_z_loss_mlp": 0.11413574, "routerloss_mlp": 0.0, "step": 2341, "time_per_iteration": 3.0700981616973877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087051, "balance_loss_mlp": 1.07579744, "diversity_loss_mlp": 0.0, "epoch": 0.4505579068872643, "flos": 486373165056.0, "grad_norm": 0.09071323966594208, "language_loss": 0.83541143, "learning_rate": 0.0006036104539065726, "loss": 0.84628195, "num_input_tokens_seen": 195562128, "router_z_loss_mlp": 0.11260986, "routerloss_mlp": 0.0, "step": 2342, "time_per_iteration": 2.6694719791412354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089407, "balance_loss_mlp": 1.07793319, "diversity_loss_mlp": 0.0, "epoch": 0.4507502885725279, "flos": 884803046400.0, "grad_norm": 0.08270437502254605, "language_loss": 0.84371507, "learning_rate": 0.000603305653786223, "loss": 0.85460913, "num_input_tokens_seen": 195646800, "router_z_loss_mlp": 0.11474609, "routerloss_mlp": 0.0, "step": 2343, "time_per_iteration": 3.16105318069458 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083424, "balance_loss_mlp": 1.07187295, "diversity_loss_mlp": 0.0, "epoch": 0.45094267025779144, "flos": 578339730432.0, "grad_norm": 0.07028076371432387, "language_loss": 0.84103405, "learning_rate": 0.0006030008135590622, "loss": 0.85186827, "num_input_tokens_seen": 195719648, "router_z_loss_mlp": 0.11553955, "routerloss_mlp": 0.0, "step": 2344, "time_per_iteration": 2.7197835445404053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082164, "balance_loss_mlp": 1.07096398, "diversity_loss_mlp": 0.0, "epoch": 0.45113505194305503, "flos": 525387492864.0, "grad_norm": 0.05864949769745669, "language_loss": 0.7999413, "learning_rate": 0.0006026959333434387, "loss": 0.81076288, "num_input_tokens_seen": 195794800, "router_z_loss_mlp": 0.11199951, "routerloss_mlp": 0.0, "step": 2345, "time_per_iteration": 2.777010202407837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00919083, "balance_loss_mlp": 1.6008426, "diversity_loss_mlp": 0.20793086, "epoch": 0.45132743362831856, "flos": 502055470080.0, "grad_norm": 0.028469676504860836, "language_loss": 0.77684712, "learning_rate": 0.0006023910132577181, "loss": 0.78603798, "num_input_tokens_seen": 195866848, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01469593, "step": 2346, "time_per_iteration": 2.689173936843872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093318, "balance_loss_mlp": 1.08186746, "diversity_loss_mlp": 0.0, "epoch": 0.45151981531358215, "flos": 431918811648.0, "grad_norm": 0.07173117007756048, "language_loss": 0.84956741, "learning_rate": 0.0006020860534202806, "loss": 0.86050057, "num_input_tokens_seen": 195930640, "router_z_loss_mlp": 0.11450195, "routerloss_mlp": 0.0, "step": 2347, "time_per_iteration": 2.499941110610962 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099048, "balance_loss_mlp": 1.08747303, "diversity_loss_mlp": 0.0, "epoch": 0.4517121969988457, "flos": 712159299072.0, "grad_norm": 0.06525031943024168, "language_loss": 0.81076705, "learning_rate": 0.0006017810539495224, "loss": 0.82175756, "num_input_tokens_seen": 196014240, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2348, "time_per_iteration": 2.9487318992614746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094541, "balance_loss_mlp": 1.08284068, "diversity_loss_mlp": 0.0, "epoch": 0.45190457868410927, "flos": 579468888576.0, "grad_norm": 0.07881291561071736, "language_loss": 0.82607108, "learning_rate": 0.0006014760149638547, "loss": 0.83701646, "num_input_tokens_seen": 196083296, "router_z_loss_mlp": 0.11700439, "routerloss_mlp": 0.0, "step": 2349, "time_per_iteration": 2.7228691577911377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096509, "balance_loss_mlp": 1.0852139, "diversity_loss_mlp": 0.0, "epoch": 0.45209696036937286, "flos": 482657149440.0, "grad_norm": 0.08019466042103662, "language_loss": 0.88398969, "learning_rate": 0.000601170936581704, "loss": 0.8949548, "num_input_tokens_seen": 196147840, "router_z_loss_mlp": 0.112854, "routerloss_mlp": 0.0, "step": 2350, "time_per_iteration": 2.521714687347412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090727, "balance_loss_mlp": 1.07951522, "diversity_loss_mlp": 0.0, "epoch": 0.4522893420546364, "flos": 540207512064.0, "grad_norm": 0.08533615412567333, "language_loss": 0.84897137, "learning_rate": 0.0006008658189215121, "loss": 0.85987866, "num_input_tokens_seen": 196219008, "router_z_loss_mlp": 0.11199951, "routerloss_mlp": 0.0, "step": 2351, "time_per_iteration": 2.6506216526031494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087074, "balance_loss_mlp": 1.07545722, "diversity_loss_mlp": 0.0, "epoch": 0.4524817237399, "flos": 496676551680.0, "grad_norm": 0.09237808795246917, "language_loss": 0.80232167, "learning_rate": 0.0006005606621017366, "loss": 0.81319243, "num_input_tokens_seen": 196287792, "router_z_loss_mlp": 0.1161499, "routerloss_mlp": 0.0, "step": 2352, "time_per_iteration": 2.5878968238830566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010807, "balance_loss_mlp": 1.06907678, "diversity_loss_mlp": 0.0, "epoch": 0.4526741054251635, "flos": 652550681088.0, "grad_norm": 0.07057821380790058, "language_loss": 0.80339801, "learning_rate": 0.0006002554662408496, "loss": 0.81420493, "num_input_tokens_seen": 196371776, "router_z_loss_mlp": 0.1161499, "routerloss_mlp": 0.0, "step": 2353, "time_per_iteration": 2.883782386779785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080645, "balance_loss_mlp": 1.0691061, "diversity_loss_mlp": 0.0, "epoch": 0.4528664871104271, "flos": 570939632640.0, "grad_norm": 0.0736680584084088, "language_loss": 0.9135446, "learning_rate": 0.0005999502314573388, "loss": 0.9243511, "num_input_tokens_seen": 196441840, "router_z_loss_mlp": 0.11535645, "routerloss_mlp": 0.0, "step": 2354, "time_per_iteration": 2.645484685897827 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103614, "balance_loss_mlp": 1.09201527, "diversity_loss_mlp": 0.0, "epoch": 0.45305886879569063, "flos": 458719801344.0, "grad_norm": 0.07036557956994945, "language_loss": 0.86196381, "learning_rate": 0.0005996449578697066, "loss": 0.87299991, "num_input_tokens_seen": 196510464, "router_z_loss_mlp": 0.11602783, "routerloss_mlp": 0.0, "step": 2355, "time_per_iteration": 2.648574113845825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00906536, "balance_loss_mlp": 1.57839537, "diversity_loss_mlp": 0.20635399, "epoch": 0.4532512504809542, "flos": 505178643456.0, "grad_norm": 0.031145483684461562, "language_loss": 0.81619978, "learning_rate": 0.0005993396455964709, "loss": 0.82526517, "num_input_tokens_seen": 196583888, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01416124, "step": 2356, "time_per_iteration": 2.7277767658233643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0115937, "balance_loss_mlp": 1.14805746, "diversity_loss_mlp": 0.0, "epoch": 0.4534436321662178, "flos": 582213961728.0, "grad_norm": 0.07904312092760724, "language_loss": 0.81657517, "learning_rate": 0.0005990342947561647, "loss": 0.82816887, "num_input_tokens_seen": 196652816, "router_z_loss_mlp": 0.11315918, "routerloss_mlp": 0.0, "step": 2357, "time_per_iteration": 2.696223258972168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01167894, "balance_loss_mlp": 1.15651524, "diversity_loss_mlp": 0.0, "epoch": 0.45363601385148133, "flos": 549720193536.0, "grad_norm": 0.07381995676601517, "language_loss": 0.78198934, "learning_rate": 0.0005987289054673351, "loss": 0.79366827, "num_input_tokens_seen": 196720208, "router_z_loss_mlp": 0.1137085, "routerloss_mlp": 0.0, "step": 2358, "time_per_iteration": 2.602642059326172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01360078, "balance_loss_mlp": 1.35392714, "diversity_loss_mlp": 0.0, "epoch": 0.4538283955367449, "flos": 1474559520768.0, "grad_norm": 0.12195170998658643, "language_loss": 0.76575738, "learning_rate": 0.0005984234778485451, "loss": 0.77935815, "num_input_tokens_seen": 196947696, "router_z_loss_mlp": 0.06152344, "routerloss_mlp": 0.0, "step": 2359, "time_per_iteration": 4.880090713500977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01146892, "balance_loss_mlp": 1.13553107, "diversity_loss_mlp": 0.0, "epoch": 0.45402077722200845, "flos": 584711986176.0, "grad_norm": 0.07250720881476776, "language_loss": 0.91548061, "learning_rate": 0.0005981180120183722, "loss": 0.9269495, "num_input_tokens_seen": 197015712, "router_z_loss_mlp": 0.11364746, "routerloss_mlp": 0.0, "step": 2360, "time_per_iteration": 2.680730104446411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133243, "balance_loss_mlp": 1.121382, "diversity_loss_mlp": 0.0, "epoch": 0.45421315890727204, "flos": 531747265536.0, "grad_norm": 0.055968167495159496, "language_loss": 0.85338825, "learning_rate": 0.0005978125080954089, "loss": 0.8647207, "num_input_tokens_seen": 197094880, "router_z_loss_mlp": 0.11853027, "routerloss_mlp": 0.0, "step": 2361, "time_per_iteration": 2.791376829147339 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124184, "balance_loss_mlp": 1.11265099, "diversity_loss_mlp": 0.0, "epoch": 0.4544055405925356, "flos": 785221641216.0, "grad_norm": 0.08653591933533131, "language_loss": 0.77322888, "learning_rate": 0.000597506966198262, "loss": 0.7844708, "num_input_tokens_seen": 197176448, "router_z_loss_mlp": 0.11529541, "routerloss_mlp": 0.0, "step": 2362, "time_per_iteration": 2.97446870803833 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119088, "balance_loss_mlp": 1.10733426, "diversity_loss_mlp": 0.0, "epoch": 0.45459792227779916, "flos": 518199939072.0, "grad_norm": 0.09240364374598002, "language_loss": 0.84247041, "learning_rate": 0.0005972013864455536, "loss": 0.85366124, "num_input_tokens_seen": 197243520, "router_z_loss_mlp": 0.11743164, "routerloss_mlp": 0.0, "step": 2363, "time_per_iteration": 2.577167510986328 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108786, "balance_loss_mlp": 1.09771168, "diversity_loss_mlp": 0.0, "epoch": 0.4547903039630627, "flos": 537563755008.0, "grad_norm": 0.0787330127694287, "language_loss": 0.8535012, "learning_rate": 0.0005968957689559203, "loss": 0.8645891, "num_input_tokens_seen": 197311536, "router_z_loss_mlp": 0.11077881, "routerloss_mlp": 0.0, "step": 2364, "time_per_iteration": 2.7120981216430664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105615, "balance_loss_mlp": 1.09457588, "diversity_loss_mlp": 0.0, "epoch": 0.4549826856483263, "flos": 528676222464.0, "grad_norm": 0.07389843074969835, "language_loss": 0.88484383, "learning_rate": 0.0005965901138480131, "loss": 0.89590001, "num_input_tokens_seen": 197382752, "router_z_loss_mlp": 0.1104126, "routerloss_mlp": 0.0, "step": 2365, "time_per_iteration": 2.578874349594116 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110202, "balance_loss_mlp": 1.09081471, "diversity_loss_mlp": 0.0, "epoch": 0.45517506733358987, "flos": 520915276800.0, "grad_norm": 0.06426783448513047, "language_loss": 0.87068385, "learning_rate": 0.0005962844212404982, "loss": 0.88170409, "num_input_tokens_seen": 197456592, "router_z_loss_mlp": 0.11206055, "routerloss_mlp": 0.0, "step": 2366, "time_per_iteration": 2.6638920307159424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096105, "balance_loss_mlp": 1.08472049, "diversity_loss_mlp": 0.0, "epoch": 0.4553674490188534, "flos": 451052831232.0, "grad_norm": 0.05830156527831164, "language_loss": 0.87147355, "learning_rate": 0.0005959786912520558, "loss": 0.88243461, "num_input_tokens_seen": 197525408, "router_z_loss_mlp": 0.11376953, "routerloss_mlp": 0.0, "step": 2367, "time_per_iteration": 2.6142454147338867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088371, "balance_loss_mlp": 1.07726681, "diversity_loss_mlp": 0.0, "epoch": 0.455559830704117, "flos": 546594448896.0, "grad_norm": 0.06261196085687584, "language_loss": 0.83712542, "learning_rate": 0.0005956729240013806, "loss": 0.84800917, "num_input_tokens_seen": 197608480, "router_z_loss_mlp": 0.11108398, "routerloss_mlp": 0.0, "step": 2368, "time_per_iteration": 2.786256790161133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095858, "balance_loss_mlp": 1.08447385, "diversity_loss_mlp": 0.0, "epoch": 0.4557522123893805, "flos": 583765636608.0, "grad_norm": 0.06874460659515655, "language_loss": 0.91648531, "learning_rate": 0.0005953671196071824, "loss": 0.92744386, "num_input_tokens_seen": 197678416, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2369, "time_per_iteration": 2.756943941116333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093695, "balance_loss_mlp": 1.08220375, "diversity_loss_mlp": 0.0, "epoch": 0.4559445940746441, "flos": 526415334912.0, "grad_norm": 0.07258619671695062, "language_loss": 0.80044961, "learning_rate": 0.0005950612781881846, "loss": 0.81138659, "num_input_tokens_seen": 197753424, "router_z_loss_mlp": 0.1149292, "routerloss_mlp": 0.0, "step": 2370, "time_per_iteration": 2.6791019439697266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00906758, "balance_loss_mlp": 1.57760763, "diversity_loss_mlp": 0.20680004, "epoch": 0.45613697575990764, "flos": 652120823808.0, "grad_norm": 0.03266097765038979, "language_loss": 0.76005763, "learning_rate": 0.0005947553998631259, "loss": 0.76912522, "num_input_tokens_seen": 197832080, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01455403, "step": 2371, "time_per_iteration": 2.908493995666504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010769, "balance_loss_mlp": 1.06543183, "diversity_loss_mlp": 0.0, "epoch": 0.4563293574451712, "flos": 867119385600.0, "grad_norm": 0.05564189265933484, "language_loss": 0.79205543, "learning_rate": 0.000594449484750758, "loss": 0.80282438, "num_input_tokens_seen": 197919536, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2372, "time_per_iteration": 3.18151593208313 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072251, "balance_loss_mlp": 1.06046152, "diversity_loss_mlp": 0.0, "epoch": 0.45652173913043476, "flos": 498079922688.0, "grad_norm": 0.07444834598910231, "language_loss": 0.83208215, "learning_rate": 0.0005941435329698484, "loss": 0.84280467, "num_input_tokens_seen": 197991872, "router_z_loss_mlp": 0.11785889, "routerloss_mlp": 0.0, "step": 2373, "time_per_iteration": 2.6709630489349365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107048, "balance_loss_mlp": 1.05895281, "diversity_loss_mlp": 0.0, "epoch": 0.45671412081569834, "flos": 560856130560.0, "grad_norm": 0.06837725942446468, "language_loss": 0.83204812, "learning_rate": 0.0005938375446391778, "loss": 0.84275293, "num_input_tokens_seen": 198063392, "router_z_loss_mlp": 0.11529541, "routerloss_mlp": 0.0, "step": 2374, "time_per_iteration": 2.6943106651306152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074517, "balance_loss_mlp": 1.06261396, "diversity_loss_mlp": 0.0, "epoch": 0.45690650250096193, "flos": 503122959360.0, "grad_norm": 0.0748623734907781, "language_loss": 0.8912878, "learning_rate": 0.0005935315198775415, "loss": 0.90203297, "num_input_tokens_seen": 198131232, "router_z_loss_mlp": 0.11901855, "routerloss_mlp": 0.0, "step": 2375, "time_per_iteration": 2.6303911209106445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066821, "balance_loss_mlp": 1.05491209, "diversity_loss_mlp": 0.0, "epoch": 0.45709888418622546, "flos": 430698249216.0, "grad_norm": 0.06590971106227904, "language_loss": 0.87093645, "learning_rate": 0.0005932254588037486, "loss": 0.88160467, "num_input_tokens_seen": 198194944, "router_z_loss_mlp": 0.11907959, "routerloss_mlp": 0.0, "step": 2376, "time_per_iteration": 2.5003554821014404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106434, "balance_loss_mlp": 1.0520016, "diversity_loss_mlp": 0.0, "epoch": 0.45729126587148905, "flos": 525654365184.0, "grad_norm": 0.07188519107297629, "language_loss": 0.86239958, "learning_rate": 0.000592919361536623, "loss": 0.87304294, "num_input_tokens_seen": 198265728, "router_z_loss_mlp": 0.12335205, "routerloss_mlp": 0.0, "step": 2377, "time_per_iteration": 2.6426758766174316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106134, "balance_loss_mlp": 1.04946113, "diversity_loss_mlp": 0.0, "epoch": 0.4574836475567526, "flos": 638002676736.0, "grad_norm": 0.06083573176815847, "language_loss": 0.88679874, "learning_rate": 0.0005926132281950017, "loss": 0.89741206, "num_input_tokens_seen": 198336640, "router_z_loss_mlp": 0.11871338, "routerloss_mlp": 0.0, "step": 2378, "time_per_iteration": 2.7510690689086914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065869, "balance_loss_mlp": 1.05310154, "diversity_loss_mlp": 0.0, "epoch": 0.45767602924201617, "flos": 649588294656.0, "grad_norm": 0.07940360452878177, "language_loss": 0.85365742, "learning_rate": 0.0005923070588977367, "loss": 0.86431611, "num_input_tokens_seen": 198413552, "router_z_loss_mlp": 0.12774658, "routerloss_mlp": 0.0, "step": 2379, "time_per_iteration": 2.7969985008239746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066405, "balance_loss_mlp": 1.05444837, "diversity_loss_mlp": 0.0, "epoch": 0.4578684109272797, "flos": 746676817920.0, "grad_norm": 0.06398281947580985, "language_loss": 0.86384034, "learning_rate": 0.0005920008537636931, "loss": 0.87450439, "num_input_tokens_seen": 198490864, "router_z_loss_mlp": 0.11956787, "routerloss_mlp": 0.0, "step": 2380, "time_per_iteration": 2.90964412689209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066454, "balance_loss_mlp": 1.05391335, "diversity_loss_mlp": 0.0, "epoch": 0.4580607926125433, "flos": 641469072384.0, "grad_norm": 0.05698304417859526, "language_loss": 0.86739266, "learning_rate": 0.0005916946129117504, "loss": 0.87805718, "num_input_tokens_seen": 198571200, "router_z_loss_mlp": 0.12548828, "routerloss_mlp": 0.0, "step": 2381, "time_per_iteration": 2.9013612270355225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074542, "balance_loss_mlp": 1.06223381, "diversity_loss_mlp": 0.0, "epoch": 0.4582531742978069, "flos": 801857636352.0, "grad_norm": 0.07634094682432664, "language_loss": 0.80304879, "learning_rate": 0.0005913883364608017, "loss": 0.81379426, "num_input_tokens_seen": 198658624, "router_z_loss_mlp": 0.12298584, "routerloss_mlp": 0.0, "step": 2382, "time_per_iteration": 3.086503505706787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108411, "balance_loss_mlp": 1.07212973, "diversity_loss_mlp": 0.0, "epoch": 0.4584455559830704, "flos": 684295962624.0, "grad_norm": 0.06243795661807547, "language_loss": 0.8841778, "learning_rate": 0.0005910820245297542, "loss": 0.89501894, "num_input_tokens_seen": 198731312, "router_z_loss_mlp": 0.11975098, "routerloss_mlp": 0.0, "step": 2383, "time_per_iteration": 2.8612842559814453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090258, "balance_loss_mlp": 1.07756186, "diversity_loss_mlp": 0.0, "epoch": 0.458637937668334, "flos": 518177544192.0, "grad_norm": 0.08243832238560393, "language_loss": 0.80972016, "learning_rate": 0.000590775677237529, "loss": 0.82062268, "num_input_tokens_seen": 198805296, "router_z_loss_mlp": 0.12695312, "routerloss_mlp": 0.0, "step": 2384, "time_per_iteration": 2.731405735015869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094631, "balance_loss_mlp": 1.08257282, "diversity_loss_mlp": 0.0, "epoch": 0.4588303193535975, "flos": 505499844096.0, "grad_norm": 0.07578687885193977, "language_loss": 0.80532229, "learning_rate": 0.0005904692947030601, "loss": 0.81626856, "num_input_tokens_seen": 198872112, "router_z_loss_mlp": 0.1204834, "routerloss_mlp": 0.0, "step": 2385, "time_per_iteration": 2.6176209449768066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106556, "balance_loss_mlp": 1.09437895, "diversity_loss_mlp": 0.0, "epoch": 0.4590227010388611, "flos": 495905670144.0, "grad_norm": 0.08078833732724985, "language_loss": 0.8953619, "learning_rate": 0.0005901628770452963, "loss": 0.90642744, "num_input_tokens_seen": 198938480, "router_z_loss_mlp": 0.1217041, "routerloss_mlp": 0.0, "step": 2386, "time_per_iteration": 2.5513737201690674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115712, "balance_loss_mlp": 1.10345697, "diversity_loss_mlp": 0.0, "epoch": 0.45921508272412465, "flos": 493620189696.0, "grad_norm": 0.09403156888929357, "language_loss": 0.87502134, "learning_rate": 0.000589856424383199, "loss": 0.88617843, "num_input_tokens_seen": 199008608, "router_z_loss_mlp": 0.12255859, "routerloss_mlp": 0.0, "step": 2387, "time_per_iteration": 2.599862813949585 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111298, "balance_loss_mlp": 1.10114813, "diversity_loss_mlp": 0.0, "epoch": 0.45940746440938823, "flos": 691394683392.0, "grad_norm": 0.08117329221401763, "language_loss": 0.8309918, "learning_rate": 0.000589549936835744, "loss": 0.8421216, "num_input_tokens_seen": 199084592, "router_z_loss_mlp": 0.11828613, "routerloss_mlp": 0.0, "step": 2388, "time_per_iteration": 2.914754867553711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101869, "balance_loss_mlp": 1.0899775, "diversity_loss_mlp": 0.0, "epoch": 0.45959984609465176, "flos": 503738196480.0, "grad_norm": 0.06559429512714879, "language_loss": 0.79056096, "learning_rate": 0.0005892434145219202, "loss": 0.80157959, "num_input_tokens_seen": 199151504, "router_z_loss_mlp": 0.11883545, "routerloss_mlp": 0.0, "step": 2389, "time_per_iteration": 2.6295268535614014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00898813, "balance_loss_mlp": 1.5620172, "diversity_loss_mlp": 0.2081904, "epoch": 0.45979222777991535, "flos": 676638904320.0, "grad_norm": 0.0365067866217014, "language_loss": 0.82780147, "learning_rate": 0.0005889368575607303, "loss": 0.83678961, "num_input_tokens_seen": 199224528, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01370906, "step": 2390, "time_per_iteration": 2.8635401725769043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089349, "balance_loss_mlp": 1.07753515, "diversity_loss_mlp": 0.0, "epoch": 0.45998460946517894, "flos": 777653415936.0, "grad_norm": 0.056196182118315396, "language_loss": 0.78421402, "learning_rate": 0.00058863026607119, "loss": 0.79510748, "num_input_tokens_seen": 199312512, "router_z_loss_mlp": 0.11816406, "routerloss_mlp": 0.0, "step": 2391, "time_per_iteration": 3.0734708309173584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099092, "balance_loss_mlp": 1.08715332, "diversity_loss_mlp": 0.0, "epoch": 0.46017699115044247, "flos": 851461673472.0, "grad_norm": 0.07079174515079527, "language_loss": 0.795928, "learning_rate": 0.0005883236401723287, "loss": 0.80691886, "num_input_tokens_seen": 199397216, "router_z_loss_mlp": 0.11932373, "routerloss_mlp": 0.0, "step": 2392, "time_per_iteration": 3.1697676181793213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095241, "balance_loss_mlp": 1.08348131, "diversity_loss_mlp": 0.0, "epoch": 0.46036937283570606, "flos": 575878781952.0, "grad_norm": 0.08882239564338372, "language_loss": 0.84418833, "learning_rate": 0.0005880169799831893, "loss": 0.85514069, "num_input_tokens_seen": 199464288, "router_z_loss_mlp": 0.11761475, "routerloss_mlp": 0.0, "step": 2393, "time_per_iteration": 2.668509006500244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095056, "balance_loss_mlp": 1.08327174, "diversity_loss_mlp": 0.0, "epoch": 0.4605617545209696, "flos": 611866109952.0, "grad_norm": 0.06874062850812142, "language_loss": 0.81593782, "learning_rate": 0.0005877102856228278, "loss": 0.82688844, "num_input_tokens_seen": 199538096, "router_z_loss_mlp": 0.11779785, "routerloss_mlp": 0.0, "step": 2394, "time_per_iteration": 2.862039566040039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099301, "balance_loss_mlp": 1.08791018, "diversity_loss_mlp": 0.0, "epoch": 0.4607541362062332, "flos": 533138526720.0, "grad_norm": 0.07005170830273995, "language_loss": 0.84822053, "learning_rate": 0.0005874035572103133, "loss": 0.85921353, "num_input_tokens_seen": 199609504, "router_z_loss_mlp": 0.1138916, "routerloss_mlp": 0.0, "step": 2395, "time_per_iteration": 2.660466194152832 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092906, "balance_loss_mlp": 1.08152771, "diversity_loss_mlp": 0.0, "epoch": 0.4609465178914967, "flos": 647312726016.0, "grad_norm": 0.09691208121118819, "language_loss": 0.82382149, "learning_rate": 0.0005870967948647288, "loss": 0.83475053, "num_input_tokens_seen": 199678960, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2396, "time_per_iteration": 2.8379006385803223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01259876, "balance_loss_mlp": 1.25238955, "diversity_loss_mlp": 0.0, "epoch": 0.4611388995767603, "flos": 1466287225344.0, "grad_norm": 0.08205623370138872, "language_loss": 0.743083, "learning_rate": 0.0005867899987051693, "loss": 0.75568175, "num_input_tokens_seen": 199903568, "router_z_loss_mlp": 0.07470703, "routerloss_mlp": 0.0, "step": 2397, "time_per_iteration": 5.0380027294158936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00912357, "balance_loss_mlp": 1.5885272, "diversity_loss_mlp": 0.20776251, "epoch": 0.46133128126202383, "flos": 723112427520.0, "grad_norm": 0.030510515868204604, "language_loss": 0.86040902, "learning_rate": 0.0005864831688507443, "loss": 0.86953259, "num_input_tokens_seen": 199988672, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0142122, "step": 2398, "time_per_iteration": 2.9795196056365967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099565, "balance_loss_mlp": 1.08854449, "diversity_loss_mlp": 0.0, "epoch": 0.4615236629472874, "flos": 548010302976.0, "grad_norm": 0.07495608045078013, "language_loss": 0.75224954, "learning_rate": 0.0005861763054205754, "loss": 0.76324517, "num_input_tokens_seen": 200062304, "router_z_loss_mlp": 0.11022949, "routerloss_mlp": 0.0, "step": 2399, "time_per_iteration": 2.7307660579681396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00908198, "balance_loss_mlp": 1.58042729, "diversity_loss_mlp": 0.20863593, "epoch": 0.461716044632551, "flos": 602244771840.0, "grad_norm": 0.03052990379504839, "language_loss": 0.8056978, "learning_rate": 0.0005858694085337976, "loss": 0.81477976, "num_input_tokens_seen": 200138464, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01366598, "step": 2400, "time_per_iteration": 2.8421711921691895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01115275, "balance_loss_mlp": 1.10424817, "diversity_loss_mlp": 0.0, "epoch": 0.46190842631781454, "flos": 474476258304.0, "grad_norm": 0.08470381171074581, "language_loss": 0.8355788, "learning_rate": 0.0005855624783095589, "loss": 0.84673154, "num_input_tokens_seen": 200205728, "router_z_loss_mlp": 0.11022949, "routerloss_mlp": 0.0, "step": 2401, "time_per_iteration": 2.554006814956665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114727, "balance_loss_mlp": 1.10386109, "diversity_loss_mlp": 0.0, "epoch": 0.4621008080030781, "flos": 437483109888.0, "grad_norm": 0.07139821582333657, "language_loss": 0.85265267, "learning_rate": 0.00058525551486702, "loss": 0.86379993, "num_input_tokens_seen": 200269824, "router_z_loss_mlp": 0.10876465, "routerloss_mlp": 0.0, "step": 2402, "time_per_iteration": 2.5159239768981934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119193, "balance_loss_mlp": 1.10795164, "diversity_loss_mlp": 0.0, "epoch": 0.46229318968834165, "flos": 525461644800.0, "grad_norm": 0.08747389081307531, "language_loss": 0.80850065, "learning_rate": 0.0005849485183253548, "loss": 0.81969261, "num_input_tokens_seen": 200341264, "router_z_loss_mlp": 0.11242676, "routerloss_mlp": 0.0, "step": 2403, "time_per_iteration": 2.643031358718872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110943, "balance_loss_mlp": 1.09971905, "diversity_loss_mlp": 0.0, "epoch": 0.46248557137360524, "flos": 439622857728.0, "grad_norm": 0.06974006499463392, "language_loss": 0.8764264, "learning_rate": 0.0005846414888037501, "loss": 0.88753581, "num_input_tokens_seen": 200405632, "router_z_loss_mlp": 0.11224365, "routerloss_mlp": 0.0, "step": 2404, "time_per_iteration": 2.4847412109375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091069, "balance_loss_mlp": 1.07962489, "diversity_loss_mlp": 0.0, "epoch": 0.4626779530588688, "flos": 617608447488.0, "grad_norm": 0.07303422211334305, "language_loss": 0.82384312, "learning_rate": 0.0005843344264214049, "loss": 0.83475375, "num_input_tokens_seen": 200479312, "router_z_loss_mlp": 0.11444092, "routerloss_mlp": 0.0, "step": 2405, "time_per_iteration": 2.7470028400421143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093931, "balance_loss_mlp": 1.08265948, "diversity_loss_mlp": 0.0, "epoch": 0.46287033474413236, "flos": 670108432896.0, "grad_norm": 0.06660378994806349, "language_loss": 0.84838545, "learning_rate": 0.0005840273312975317, "loss": 0.85932475, "num_input_tokens_seen": 200552976, "router_z_loss_mlp": 0.11273193, "routerloss_mlp": 0.0, "step": 2406, "time_per_iteration": 2.834179162979126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082019, "balance_loss_mlp": 1.07018733, "diversity_loss_mlp": 0.0, "epoch": 0.46306271642939595, "flos": 480233276928.0, "grad_norm": 0.07201348711751891, "language_loss": 0.89853442, "learning_rate": 0.0005837202035513555, "loss": 0.90935457, "num_input_tokens_seen": 200621088, "router_z_loss_mlp": 0.11828613, "routerloss_mlp": 0.0, "step": 2407, "time_per_iteration": 2.578505277633667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081302, "balance_loss_mlp": 1.06933987, "diversity_loss_mlp": 0.0, "epoch": 0.4632550981146595, "flos": 580686879744.0, "grad_norm": 0.06479654524201506, "language_loss": 0.81299376, "learning_rate": 0.0005834130433021136, "loss": 0.82380676, "num_input_tokens_seen": 200698400, "router_z_loss_mlp": 0.11956787, "routerloss_mlp": 0.0, "step": 2408, "time_per_iteration": 2.742830991744995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075359, "balance_loss_mlp": 1.0631156, "diversity_loss_mlp": 0.0, "epoch": 0.46344747979992307, "flos": 523964298240.0, "grad_norm": 0.06628126289532602, "language_loss": 0.73402894, "learning_rate": 0.0005831058506690563, "loss": 0.74478251, "num_input_tokens_seen": 200767264, "router_z_loss_mlp": 0.12237549, "routerloss_mlp": 0.0, "step": 2409, "time_per_iteration": 2.6239566802978516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00875374, "balance_loss_mlp": 1.5126431, "diversity_loss_mlp": 0.20975235, "epoch": 0.4636398614851866, "flos": 746501349888.0, "grad_norm": 0.03030502692098504, "language_loss": 0.86162984, "learning_rate": 0.0005827986257714464, "loss": 0.87038362, "num_input_tokens_seen": 200841440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01417591, "step": 2410, "time_per_iteration": 2.9302031993865967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069185, "balance_loss_mlp": 1.05664992, "diversity_loss_mlp": 0.0, "epoch": 0.4638322431704502, "flos": 596547224064.0, "grad_norm": 0.07558638886093381, "language_loss": 0.88803709, "learning_rate": 0.0005824913687285591, "loss": 0.89872897, "num_input_tokens_seen": 200911296, "router_z_loss_mlp": 0.12542725, "routerloss_mlp": 0.0, "step": 2411, "time_per_iteration": 2.685814142227173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070655, "balance_loss_mlp": 1.05821514, "diversity_loss_mlp": 0.0, "epoch": 0.4640246248557137, "flos": 539443971072.0, "grad_norm": 0.1080687232114875, "language_loss": 0.81367224, "learning_rate": 0.0005821840796596821, "loss": 0.82437879, "num_input_tokens_seen": 200981920, "router_z_loss_mlp": 0.12445068, "routerloss_mlp": 0.0, "step": 2412, "time_per_iteration": 2.6551058292388916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073612, "balance_loss_mlp": 1.06099916, "diversity_loss_mlp": 0.0, "epoch": 0.4642170065409773, "flos": 562625118720.0, "grad_norm": 0.07026214254932567, "language_loss": 0.80428362, "learning_rate": 0.0005818767586841158, "loss": 0.81501973, "num_input_tokens_seen": 201059392, "router_z_loss_mlp": 0.12609863, "routerloss_mlp": 0.0, "step": 2413, "time_per_iteration": 2.759437322616577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085225, "balance_loss_mlp": 1.07259476, "diversity_loss_mlp": 0.0, "epoch": 0.46440938822624084, "flos": 530959131648.0, "grad_norm": 0.08627931539992734, "language_loss": 0.86441922, "learning_rate": 0.0005815694059211726, "loss": 0.8752715, "num_input_tokens_seen": 201130192, "router_z_loss_mlp": 0.12640381, "routerloss_mlp": 0.0, "step": 2414, "time_per_iteration": 2.658977746963501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01171514, "balance_loss_mlp": 1.16250181, "diversity_loss_mlp": 0.0, "epoch": 0.4646017699115044, "flos": 1526325700608.0, "grad_norm": 0.047494824411654174, "language_loss": 0.80873632, "learning_rate": 0.0005812620214901778, "loss": 0.82045138, "num_input_tokens_seen": 201354720, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 2415, "time_per_iteration": 4.799519777297974 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145606, "balance_loss_mlp": 1.13711834, "diversity_loss_mlp": 0.0, "epoch": 0.464794151596768, "flos": 1540831859712.0, "grad_norm": 0.043373387729815825, "language_loss": 0.7694506, "learning_rate": 0.000580954605510468, "loss": 0.78090668, "num_input_tokens_seen": 201592096, "router_z_loss_mlp": 0.08496094, "routerloss_mlp": 0.0, "step": 2416, "time_per_iteration": 4.990553379058838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0087124, "balance_loss_mlp": 1.50839305, "diversity_loss_mlp": 0.20828754, "epoch": 0.46498653328203154, "flos": 501467397120.0, "grad_norm": 0.030578892859867562, "language_loss": 0.86378521, "learning_rate": 0.0005806471581013931, "loss": 0.87249762, "num_input_tokens_seen": 201666160, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01289999, "step": 2417, "time_per_iteration": 2.6900436878204346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01122345, "balance_loss_mlp": 1.11040044, "diversity_loss_mlp": 0.0, "epoch": 0.46517891496729513, "flos": 676144806912.0, "grad_norm": 0.07418438196536063, "language_loss": 0.78360349, "learning_rate": 0.0005803396793823146, "loss": 0.79482698, "num_input_tokens_seen": 201733552, "router_z_loss_mlp": 0.1194458, "routerloss_mlp": 0.0, "step": 2418, "time_per_iteration": 2.8027873039245605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113389, "balance_loss_mlp": 1.12212396, "diversity_loss_mlp": 0.0, "epoch": 0.46537129665255866, "flos": 585351816192.0, "grad_norm": 0.07660062238284089, "language_loss": 0.85582161, "learning_rate": 0.0005800321694726065, "loss": 0.86716056, "num_input_tokens_seen": 201806128, "router_z_loss_mlp": 0.11761475, "routerloss_mlp": 0.0, "step": 2419, "time_per_iteration": 4.293209075927734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00870744, "balance_loss_mlp": 1.50698626, "diversity_loss_mlp": 0.20827082, "epoch": 0.46556367833782225, "flos": 587704108032.0, "grad_norm": 0.03270390918014964, "language_loss": 0.86636543, "learning_rate": 0.0005797246284916545, "loss": 0.87507284, "num_input_tokens_seen": 201874224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01311516, "step": 2420, "time_per_iteration": 2.7184417247772217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112587, "balance_loss_mlp": 1.1061976, "diversity_loss_mlp": 0.0, "epoch": 0.4657560600230858, "flos": 1485453551616.0, "grad_norm": 0.04763479459010098, "language_loss": 0.77505189, "learning_rate": 0.0005794170565588569, "loss": 0.78617769, "num_input_tokens_seen": 202111648, "router_z_loss_mlp": 0.06396484, "routerloss_mlp": 0.0, "step": 2421, "time_per_iteration": 4.978823900222778 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01164162, "balance_loss_mlp": 1.1527952, "diversity_loss_mlp": 0.0, "epoch": 0.46594844170834937, "flos": 580247110656.0, "grad_norm": 0.08359324638355049, "language_loss": 0.87635398, "learning_rate": 0.0005791094537936233, "loss": 0.8879956, "num_input_tokens_seen": 202183344, "router_z_loss_mlp": 0.1137085, "routerloss_mlp": 0.0, "step": 2422, "time_per_iteration": 2.706270217895508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01145768, "balance_loss_mlp": 1.1349256, "diversity_loss_mlp": 0.0, "epoch": 0.4661408233936129, "flos": 512571400704.0, "grad_norm": 0.07317342210777962, "language_loss": 0.81790811, "learning_rate": 0.0005788018203153762, "loss": 0.82936579, "num_input_tokens_seen": 202252512, "router_z_loss_mlp": 0.10845947, "routerloss_mlp": 0.0, "step": 2423, "time_per_iteration": 2.5965187549591064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0114513, "balance_loss_mlp": 1.13404965, "diversity_loss_mlp": 0.0, "epoch": 0.4663332050788765, "flos": 491077748736.0, "grad_norm": 0.08308161607945047, "language_loss": 0.85607517, "learning_rate": 0.000578494156243549, "loss": 0.86752647, "num_input_tokens_seen": 202320096, "router_z_loss_mlp": 0.11083984, "routerloss_mlp": 0.0, "step": 2424, "time_per_iteration": 2.5783984661102295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124685, "balance_loss_mlp": 1.1135745, "diversity_loss_mlp": 0.0, "epoch": 0.4665255867641401, "flos": 512623157760.0, "grad_norm": 0.06702614551613306, "language_loss": 0.88852286, "learning_rate": 0.0005781864616975878, "loss": 0.89976966, "num_input_tokens_seen": 202391552, "router_z_loss_mlp": 0.11108398, "routerloss_mlp": 0.0, "step": 2425, "time_per_iteration": 2.6615347862243652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105595, "balance_loss_mlp": 1.09463954, "diversity_loss_mlp": 0.0, "epoch": 0.4667179684494036, "flos": 424812750336.0, "grad_norm": 0.0790317604017366, "language_loss": 0.84397781, "learning_rate": 0.0005778787367969502, "loss": 0.85503376, "num_input_tokens_seen": 202457328, "router_z_loss_mlp": 0.10961914, "routerloss_mlp": 0.0, "step": 2426, "time_per_iteration": 2.5796711444854736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095822, "balance_loss_mlp": 1.08478928, "diversity_loss_mlp": 0.0, "epoch": 0.4669103501346672, "flos": 707956526592.0, "grad_norm": 0.062032004097500974, "language_loss": 0.80925953, "learning_rate": 0.0005775709816611053, "loss": 0.82021779, "num_input_tokens_seen": 202535888, "router_z_loss_mlp": 0.11029053, "routerloss_mlp": 0.0, "step": 2427, "time_per_iteration": 2.9491348266601562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085219, "balance_loss_mlp": 1.07454419, "diversity_loss_mlp": 0.0, "epoch": 0.4671027318199307, "flos": 554832239616.0, "grad_norm": 0.0676389696771178, "language_loss": 0.83549029, "learning_rate": 0.0005772631964095346, "loss": 0.8463425, "num_input_tokens_seen": 202608400, "router_z_loss_mlp": 0.10681152, "routerloss_mlp": 0.0, "step": 2428, "time_per_iteration": 2.6981353759765625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081501, "balance_loss_mlp": 1.07072484, "diversity_loss_mlp": 0.0, "epoch": 0.4672951135051943, "flos": 567109817856.0, "grad_norm": 0.08126061261115217, "language_loss": 0.8576231, "learning_rate": 0.000576955381161731, "loss": 0.86843812, "num_input_tokens_seen": 202677712, "router_z_loss_mlp": 0.10778809, "routerloss_mlp": 0.0, "step": 2429, "time_per_iteration": 2.6633517742156982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074344, "balance_loss_mlp": 1.06313229, "diversity_loss_mlp": 0.0, "epoch": 0.46748749519045785, "flos": 424518713856.0, "grad_norm": 0.08275287351868318, "language_loss": 0.86212349, "learning_rate": 0.0005766475360371985, "loss": 0.87286699, "num_input_tokens_seen": 202743824, "router_z_loss_mlp": 0.11218262, "routerloss_mlp": 0.0, "step": 2430, "time_per_iteration": 2.5904853343963623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072898, "balance_loss_mlp": 1.06205034, "diversity_loss_mlp": 0.0, "epoch": 0.46767987687572143, "flos": 538344548352.0, "grad_norm": 0.0860704645170746, "language_loss": 0.84563982, "learning_rate": 0.0005763396611554536, "loss": 0.85636878, "num_input_tokens_seen": 202813072, "router_z_loss_mlp": 0.10852051, "routerloss_mlp": 0.0, "step": 2431, "time_per_iteration": 2.6467607021331787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071528, "balance_loss_mlp": 1.0607698, "diversity_loss_mlp": 0.0, "epoch": 0.467872258560985, "flos": 823702224384.0, "grad_norm": 0.08998246562287979, "language_loss": 0.80544329, "learning_rate": 0.0005760317566360237, "loss": 0.81615859, "num_input_tokens_seen": 202886576, "router_z_loss_mlp": 0.10760498, "routerloss_mlp": 0.0, "step": 2432, "time_per_iteration": 3.006641387939453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075816, "balance_loss_mlp": 1.0648669, "diversity_loss_mlp": 0.0, "epoch": 0.46806464024624855, "flos": 661663240704.0, "grad_norm": 0.07509845156715887, "language_loss": 0.84929144, "learning_rate": 0.000575723822598448, "loss": 0.86004961, "num_input_tokens_seen": 202956736, "router_z_loss_mlp": 0.10961914, "routerloss_mlp": 0.0, "step": 2433, "time_per_iteration": 2.764425277709961 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067328, "balance_loss_mlp": 1.0558188, "diversity_loss_mlp": 0.0, "epoch": 0.46825702193151214, "flos": 755700171264.0, "grad_norm": 0.06651895210271294, "language_loss": 0.8167448, "learning_rate": 0.0005754158591622773, "loss": 0.82741809, "num_input_tokens_seen": 203036432, "router_z_loss_mlp": 0.1149292, "routerloss_mlp": 0.0, "step": 2434, "time_per_iteration": 2.9786107540130615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075165, "balance_loss_mlp": 1.06366098, "diversity_loss_mlp": 0.0, "epoch": 0.4684494036167757, "flos": 439393061376.0, "grad_norm": 0.07251033111677281, "language_loss": 0.82255369, "learning_rate": 0.0005751078664470732, "loss": 0.83330536, "num_input_tokens_seen": 203101904, "router_z_loss_mlp": 0.11499023, "routerloss_mlp": 0.0, "step": 2435, "time_per_iteration": 2.5367684364318848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079887, "balance_loss_mlp": 1.06816268, "diversity_loss_mlp": 0.0, "epoch": 0.46864178530203926, "flos": 532706098176.0, "grad_norm": 0.07721942828462902, "language_loss": 0.85977614, "learning_rate": 0.0005747998445724094, "loss": 0.87057501, "num_input_tokens_seen": 203170272, "router_z_loss_mlp": 0.11724854, "routerloss_mlp": 0.0, "step": 2436, "time_per_iteration": 2.636200189590454 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108497, "balance_loss_mlp": 1.07313251, "diversity_loss_mlp": 0.0, "epoch": 0.4688341669873028, "flos": 576627268608.0, "grad_norm": 0.07122055500535385, "language_loss": 0.89087129, "learning_rate": 0.0005744917936578707, "loss": 0.90172094, "num_input_tokens_seen": 203243920, "router_z_loss_mlp": 0.11828613, "routerloss_mlp": 0.0, "step": 2437, "time_per_iteration": 2.7820210456848145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089973, "balance_loss_mlp": 1.07790279, "diversity_loss_mlp": 0.0, "epoch": 0.4690265486725664, "flos": 539579791872.0, "grad_norm": 0.0674848593159629, "language_loss": 0.84104413, "learning_rate": 0.0005741837138230526, "loss": 0.85194385, "num_input_tokens_seen": 203321760, "router_z_loss_mlp": 0.1206665, "routerloss_mlp": 0.0, "step": 2438, "time_per_iteration": 2.7324602603912354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091997, "balance_loss_mlp": 1.07981968, "diversity_loss_mlp": 0.0, "epoch": 0.4692189303578299, "flos": 770510278656.0, "grad_norm": 0.08534673561441382, "language_loss": 0.86345065, "learning_rate": 0.0005738756051875627, "loss": 0.87437063, "num_input_tokens_seen": 203409088, "router_z_loss_mlp": 0.12176514, "routerloss_mlp": 0.0, "step": 2439, "time_per_iteration": 3.0705649852752686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098053, "balance_loss_mlp": 1.08564377, "diversity_loss_mlp": 0.0, "epoch": 0.4694113120430935, "flos": 571396654080.0, "grad_norm": 0.06467123496854205, "language_loss": 0.83114249, "learning_rate": 0.0005735674678710192, "loss": 0.84212297, "num_input_tokens_seen": 203481680, "router_z_loss_mlp": 0.12414551, "routerloss_mlp": 0.0, "step": 2440, "time_per_iteration": 2.6645498275756836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089062, "balance_loss_mlp": 1.07644403, "diversity_loss_mlp": 0.0, "epoch": 0.4696036937283571, "flos": 748816565760.0, "grad_norm": 0.09155388913703945, "language_loss": 0.81178355, "learning_rate": 0.0005732593019930517, "loss": 0.82267421, "num_input_tokens_seen": 203554848, "router_z_loss_mlp": 0.12628174, "routerloss_mlp": 0.0, "step": 2441, "time_per_iteration": 2.892775774002075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084176, "balance_loss_mlp": 1.07203436, "diversity_loss_mlp": 0.0, "epoch": 0.4697960754136206, "flos": 493454633472.0, "grad_norm": 0.07090754106091501, "language_loss": 0.87927258, "learning_rate": 0.0005729511076733008, "loss": 0.89011431, "num_input_tokens_seen": 203624816, "router_z_loss_mlp": 0.12139893, "routerloss_mlp": 0.0, "step": 2442, "time_per_iteration": 2.629671096801758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080039, "balance_loss_mlp": 1.06766534, "diversity_loss_mlp": 0.0, "epoch": 0.4699884570988842, "flos": 725118925824.0, "grad_norm": 0.0886658808398658, "language_loss": 0.85080904, "learning_rate": 0.000572642885031418, "loss": 0.86160946, "num_input_tokens_seen": 203698256, "router_z_loss_mlp": 0.1237793, "routerloss_mlp": 0.0, "step": 2443, "time_per_iteration": 2.858177900314331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083351, "balance_loss_mlp": 1.07077432, "diversity_loss_mlp": 0.0, "epoch": 0.47018083878414774, "flos": 555427653120.0, "grad_norm": 0.06516149518751314, "language_loss": 0.80735445, "learning_rate": 0.0005723346341870662, "loss": 0.81818795, "num_input_tokens_seen": 203772672, "router_z_loss_mlp": 0.12573242, "routerloss_mlp": 0.0, "step": 2444, "time_per_iteration": 2.7146968841552734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084098, "balance_loss_mlp": 1.07161689, "diversity_loss_mlp": 0.0, "epoch": 0.4703732204694113, "flos": 424069032960.0, "grad_norm": 0.08093347646647668, "language_loss": 0.86360067, "learning_rate": 0.0005720263552599188, "loss": 0.87444162, "num_input_tokens_seen": 203835904, "router_z_loss_mlp": 0.12493896, "routerloss_mlp": 0.0, "step": 2445, "time_per_iteration": 2.5240447521209717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077035, "balance_loss_mlp": 1.06469131, "diversity_loss_mlp": 0.0, "epoch": 0.47056560215467486, "flos": 703494222336.0, "grad_norm": 0.10031003663616385, "language_loss": 0.80052316, "learning_rate": 0.0005717180483696604, "loss": 0.81129348, "num_input_tokens_seen": 203914704, "router_z_loss_mlp": 0.12347412, "routerloss_mlp": 0.0, "step": 2446, "time_per_iteration": 2.8576042652130127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076731, "balance_loss_mlp": 1.06456566, "diversity_loss_mlp": 0.0, "epoch": 0.47075798383993844, "flos": 554963291136.0, "grad_norm": 0.06704052343949889, "language_loss": 0.82989585, "learning_rate": 0.0005714097136359862, "loss": 0.84066319, "num_input_tokens_seen": 203985072, "router_z_loss_mlp": 0.12164307, "routerloss_mlp": 0.0, "step": 2447, "time_per_iteration": 2.624566078186035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00841696, "balance_loss_mlp": 1.45028305, "diversity_loss_mlp": 0.205522, "epoch": 0.470950365525202, "flos": 564305273856.0, "grad_norm": 0.027205551471082397, "language_loss": 0.86918223, "learning_rate": 0.0005711013511786027, "loss": 0.87759912, "num_input_tokens_seen": 204061904, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01379322, "step": 2448, "time_per_iteration": 2.797086238861084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106901, "balance_loss_mlp": 1.05689788, "diversity_loss_mlp": 0.0, "epoch": 0.47114274721046556, "flos": 534450493440.0, "grad_norm": 0.06342125158561994, "language_loss": 0.83811176, "learning_rate": 0.0005707929611172263, "loss": 0.84880185, "num_input_tokens_seen": 204137392, "router_z_loss_mlp": 0.12103271, "routerloss_mlp": 0.0, "step": 2449, "time_per_iteration": 2.731825351715088 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071528, "balance_loss_mlp": 1.05951726, "diversity_loss_mlp": 0.0, "epoch": 0.47133512889572915, "flos": 473117303808.0, "grad_norm": 0.09170207604049842, "language_loss": 0.84256124, "learning_rate": 0.000570484543571585, "loss": 0.85327655, "num_input_tokens_seen": 204202752, "router_z_loss_mlp": 0.12011719, "routerloss_mlp": 0.0, "step": 2450, "time_per_iteration": 2.5735461711883545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064618, "balance_loss_mlp": 1.05268502, "diversity_loss_mlp": 0.0, "epoch": 0.4715275105809927, "flos": 459013837824.0, "grad_norm": 0.08479509676509417, "language_loss": 0.82936448, "learning_rate": 0.0005701760986614171, "loss": 0.84001064, "num_input_tokens_seen": 204266960, "router_z_loss_mlp": 0.1192627, "routerloss_mlp": 0.0, "step": 2451, "time_per_iteration": 2.537297248840332 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071081, "balance_loss_mlp": 1.0591718, "diversity_loss_mlp": 0.0, "epoch": 0.47171989226625627, "flos": 422003437056.0, "grad_norm": 0.059658494784791405, "language_loss": 0.8734417, "learning_rate": 0.0005698676265064714, "loss": 0.88415247, "num_input_tokens_seen": 204331216, "router_z_loss_mlp": 0.11901855, "routerloss_mlp": 0.0, "step": 2452, "time_per_iteration": 2.5586979389190674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076856, "balance_loss_mlp": 1.06525099, "diversity_loss_mlp": 0.0, "epoch": 0.4719122739515198, "flos": 457434998784.0, "grad_norm": 0.0707454592736124, "language_loss": 0.89208829, "learning_rate": 0.0005695591272265074, "loss": 0.90285689, "num_input_tokens_seen": 204397216, "router_z_loss_mlp": 0.1159668, "routerloss_mlp": 0.0, "step": 2453, "time_per_iteration": 2.527719736099243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088311, "balance_loss_mlp": 1.07617581, "diversity_loss_mlp": 0.0, "epoch": 0.4721046556367834, "flos": 514975449600.0, "grad_norm": 0.07134640406799209, "language_loss": 0.81947398, "learning_rate": 0.0005692506009412954, "loss": 0.83035707, "num_input_tokens_seen": 204469952, "router_z_loss_mlp": 0.12133789, "routerloss_mlp": 0.0, "step": 2454, "time_per_iteration": 2.6558947563171387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0064123, "balance_loss_mlp": 1.11988485, "diversity_loss_mlp": 0.13842735, "epoch": 0.4722970373220469, "flos": 1572258138624.0, "grad_norm": 0.002527541257966033, "language_loss": 0.7755127, "learning_rate": 0.0005689420477706156, "loss": 0.78192496, "num_input_tokens_seen": 204701152, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01207405, "step": 2455, "time_per_iteration": 5.005730628967285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088969, "balance_loss_mlp": 1.07716715, "diversity_loss_mlp": 0.0, "epoch": 0.4724894190073105, "flos": 586214102016.0, "grad_norm": 0.07179176619920838, "language_loss": 0.89308333, "learning_rate": 0.0005686334678342593, "loss": 0.90397304, "num_input_tokens_seen": 204778144, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2456, "time_per_iteration": 2.8779940605163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094143, "balance_loss_mlp": 1.08280611, "diversity_loss_mlp": 0.0, "epoch": 0.4726818006925741, "flos": 867645789696.0, "grad_norm": 0.08187467616753978, "language_loss": 0.81664062, "learning_rate": 0.0005683248612520274, "loss": 0.82758206, "num_input_tokens_seen": 204853376, "router_z_loss_mlp": 0.11334229, "routerloss_mlp": 0.0, "step": 2457, "time_per_iteration": 3.0844156742095947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087436, "balance_loss_mlp": 1.07605195, "diversity_loss_mlp": 0.0, "epoch": 0.4728741823778376, "flos": 752967581184.0, "grad_norm": 0.08330432962991885, "language_loss": 0.83940041, "learning_rate": 0.0005680162281437321, "loss": 0.85027468, "num_input_tokens_seen": 204925280, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2458, "time_per_iteration": 2.886364221572876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108407, "balance_loss_mlp": 1.07263231, "diversity_loss_mlp": 0.0, "epoch": 0.4730665640631012, "flos": 538571773440.0, "grad_norm": 0.06607837126207569, "language_loss": 0.84340584, "learning_rate": 0.000567707568629195, "loss": 0.8542465, "num_input_tokens_seen": 205000592, "router_z_loss_mlp": 0.11425781, "routerloss_mlp": 0.0, "step": 2459, "time_per_iteration": 2.7153613567352295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082795, "balance_loss_mlp": 1.0712074, "diversity_loss_mlp": 0.0, "epoch": 0.47325894574836475, "flos": 491653338624.0, "grad_norm": 0.0662532862091719, "language_loss": 0.82247961, "learning_rate": 0.0005673988828282486, "loss": 0.8333075, "num_input_tokens_seen": 205073968, "router_z_loss_mlp": 0.11584473, "routerloss_mlp": 0.0, "step": 2460, "time_per_iteration": 2.6740705966949463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079264, "balance_loss_mlp": 1.06760526, "diversity_loss_mlp": 0.0, "epoch": 0.47345132743362833, "flos": 764459223552.0, "grad_norm": 0.05997115702153478, "language_loss": 0.81122911, "learning_rate": 0.0005670901708607352, "loss": 0.82202172, "num_input_tokens_seen": 205153536, "router_z_loss_mlp": 0.11645508, "routerloss_mlp": 0.0, "step": 2461, "time_per_iteration": 3.0222864151000977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077887, "balance_loss_mlp": 1.0661211, "diversity_loss_mlp": 0.0, "epoch": 0.47364370911889186, "flos": 540173007360.0, "grad_norm": 0.12722631062247966, "language_loss": 0.83784962, "learning_rate": 0.0005667814328465076, "loss": 0.84862852, "num_input_tokens_seen": 205220944, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2462, "time_per_iteration": 2.62223744392395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071899, "balance_loss_mlp": 1.06031179, "diversity_loss_mlp": 0.0, "epoch": 0.47383609080415545, "flos": 406219815936.0, "grad_norm": 0.10920156375550993, "language_loss": 0.82163846, "learning_rate": 0.0005664726689054285, "loss": 0.83235747, "num_input_tokens_seen": 205282688, "router_z_loss_mlp": 0.11578369, "routerloss_mlp": 0.0, "step": 2463, "time_per_iteration": 2.474776029586792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072445, "balance_loss_mlp": 1.06096554, "diversity_loss_mlp": 0.0, "epoch": 0.474028472489419, "flos": 453476703744.0, "grad_norm": 0.07990467081118383, "language_loss": 0.80772603, "learning_rate": 0.0005661638791573704, "loss": 0.81845051, "num_input_tokens_seen": 205357360, "router_z_loss_mlp": 0.11474609, "routerloss_mlp": 0.0, "step": 2464, "time_per_iteration": 2.699165105819702 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073096, "balance_loss_mlp": 1.06145513, "diversity_loss_mlp": 0.0, "epoch": 0.47422085417468257, "flos": 492177171456.0, "grad_norm": 0.06593248790897067, "language_loss": 0.86978662, "learning_rate": 0.0005658550637222164, "loss": 0.8805176, "num_input_tokens_seen": 205424352, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2465, "time_per_iteration": 2.6154093742370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070219, "balance_loss_mlp": 1.0586381, "diversity_loss_mlp": 0.0, "epoch": 0.47441323585994616, "flos": 738854203392.0, "grad_norm": 0.06422453310815268, "language_loss": 0.82103038, "learning_rate": 0.0005655462227198592, "loss": 0.83173257, "num_input_tokens_seen": 205502912, "router_z_loss_mlp": 0.11566162, "routerloss_mlp": 0.0, "step": 2466, "time_per_iteration": 2.888040065765381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068955, "balance_loss_mlp": 1.05703366, "diversity_loss_mlp": 0.0, "epoch": 0.4746056175452097, "flos": 484685669376.0, "grad_norm": 0.07464863741428074, "language_loss": 0.84426093, "learning_rate": 0.0005652373562702016, "loss": 0.85495043, "num_input_tokens_seen": 205571168, "router_z_loss_mlp": 0.1192627, "routerloss_mlp": 0.0, "step": 2467, "time_per_iteration": 2.6240220069885254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071196, "balance_loss_mlp": 1.05926943, "diversity_loss_mlp": 0.0, "epoch": 0.4747979992304733, "flos": 461052269568.0, "grad_norm": 0.06778780294468974, "language_loss": 0.88405621, "learning_rate": 0.000564928464493156, "loss": 0.89476824, "num_input_tokens_seen": 205639648, "router_z_loss_mlp": 0.11920166, "routerloss_mlp": 0.0, "step": 2468, "time_per_iteration": 2.598493814468384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068768, "balance_loss_mlp": 1.05676329, "diversity_loss_mlp": 0.0, "epoch": 0.4749903809157368, "flos": 864431212032.0, "grad_norm": 0.06443301027733518, "language_loss": 0.81735635, "learning_rate": 0.000564619547508645, "loss": 0.82804406, "num_input_tokens_seen": 205721536, "router_z_loss_mlp": 0.11999512, "routerloss_mlp": 0.0, "step": 2469, "time_per_iteration": 4.510512828826904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070291, "balance_loss_mlp": 1.05816698, "diversity_loss_mlp": 0.0, "epoch": 0.4751827626010004, "flos": 505546831872.0, "grad_norm": 0.0879456232971056, "language_loss": 0.82882106, "learning_rate": 0.0005643106054366008, "loss": 0.83952397, "num_input_tokens_seen": 205788512, "router_z_loss_mlp": 0.12121582, "routerloss_mlp": 0.0, "step": 2470, "time_per_iteration": 2.5648152828216553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074496, "balance_loss_mlp": 1.06276536, "diversity_loss_mlp": 0.0, "epoch": 0.47537514428626393, "flos": 559388519424.0, "grad_norm": 0.06194770014341408, "language_loss": 0.79193991, "learning_rate": 0.000564001638396965, "loss": 0.8026849, "num_input_tokens_seen": 205863104, "router_z_loss_mlp": 0.11706543, "routerloss_mlp": 0.0, "step": 2471, "time_per_iteration": 2.7267987728118896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073205, "balance_loss_mlp": 1.06152296, "diversity_loss_mlp": 0.0, "epoch": 0.4755675259715275, "flos": 834260000256.0, "grad_norm": 0.06505306942508977, "language_loss": 0.82164901, "learning_rate": 0.0005636926465096897, "loss": 0.83238107, "num_input_tokens_seen": 205940688, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2472, "time_per_iteration": 3.035590887069702 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078551, "balance_loss_mlp": 1.06670165, "diversity_loss_mlp": 0.0, "epoch": 0.47575990765679105, "flos": 508237576704.0, "grad_norm": 0.08684318660371242, "language_loss": 0.8723672, "learning_rate": 0.0005633836298947363, "loss": 0.88315272, "num_input_tokens_seen": 206008352, "router_z_loss_mlp": 0.11846924, "routerloss_mlp": 0.0, "step": 2473, "time_per_iteration": 4.002026796340942 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091096, "balance_loss_mlp": 1.07912695, "diversity_loss_mlp": 0.0, "epoch": 0.47595228934205464, "flos": 591845211648.0, "grad_norm": 0.0706680414575132, "language_loss": 0.70566314, "learning_rate": 0.000563074588672075, "loss": 0.71657413, "num_input_tokens_seen": 206078240, "router_z_loss_mlp": 0.11950684, "routerloss_mlp": 0.0, "step": 2474, "time_per_iteration": 2.6985795497894287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089769, "balance_loss_mlp": 1.07802129, "diversity_loss_mlp": 0.0, "epoch": 0.4761446710273182, "flos": 580607958528.0, "grad_norm": 0.06282750442858279, "language_loss": 0.85378051, "learning_rate": 0.0005627655229616868, "loss": 0.86467826, "num_input_tokens_seen": 206148896, "router_z_loss_mlp": 0.11743164, "routerloss_mlp": 0.0, "step": 2475, "time_per_iteration": 2.7580935955047607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091111, "balance_loss_mlp": 1.07941031, "diversity_loss_mlp": 0.0, "epoch": 0.47633705271258175, "flos": 672893153280.0, "grad_norm": 0.07002888905047219, "language_loss": 0.90058106, "learning_rate": 0.0005624564328835616, "loss": 0.91149217, "num_input_tokens_seen": 206223792, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2476, "time_per_iteration": 2.789257764816284 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108666, "balance_loss_mlp": 1.07509637, "diversity_loss_mlp": 0.0, "epoch": 0.47652943439784534, "flos": 541857931776.0, "grad_norm": 0.06042863191219761, "language_loss": 0.84203571, "learning_rate": 0.0005621473185576986, "loss": 0.85290229, "num_input_tokens_seen": 206299376, "router_z_loss_mlp": 0.11553955, "routerloss_mlp": 0.0, "step": 2477, "time_per_iteration": 2.724280834197998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089922, "balance_loss_mlp": 1.07846594, "diversity_loss_mlp": 0.0, "epoch": 0.4767218160831089, "flos": 524819243520.0, "grad_norm": 0.07203405271885309, "language_loss": 0.87555075, "learning_rate": 0.0005618381801041068, "loss": 0.88644993, "num_input_tokens_seen": 206367936, "router_z_loss_mlp": 0.11450195, "routerloss_mlp": 0.0, "step": 2478, "time_per_iteration": 2.6800026893615723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085708, "balance_loss_mlp": 1.0738883, "diversity_loss_mlp": 0.0, "epoch": 0.47691419776837246, "flos": 568056167424.0, "grad_norm": 0.08495018756940642, "language_loss": 0.83006722, "learning_rate": 0.0005615290176428044, "loss": 0.84092432, "num_input_tokens_seen": 206438864, "router_z_loss_mlp": 0.11810303, "routerloss_mlp": 0.0, "step": 2479, "time_per_iteration": 2.6456432342529297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078377, "balance_loss_mlp": 1.06658673, "diversity_loss_mlp": 0.0, "epoch": 0.477106579453636, "flos": 530931967488.0, "grad_norm": 0.07371403414772894, "language_loss": 0.84979588, "learning_rate": 0.0005612198312938187, "loss": 0.86057961, "num_input_tokens_seen": 206516656, "router_z_loss_mlp": 0.11779785, "routerloss_mlp": 0.0, "step": 2480, "time_per_iteration": 2.7325923442840576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085233, "balance_loss_mlp": 1.0737772, "diversity_loss_mlp": 0.0, "epoch": 0.4772989611388996, "flos": 594283765248.0, "grad_norm": 0.05926830515799366, "language_loss": 0.79493093, "learning_rate": 0.0005609106211771868, "loss": 0.80578327, "num_input_tokens_seen": 206595040, "router_z_loss_mlp": 0.11450195, "routerloss_mlp": 0.0, "step": 2481, "time_per_iteration": 2.8374931812286377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108305, "balance_loss_mlp": 1.07103384, "diversity_loss_mlp": 0.0, "epoch": 0.4774913428241631, "flos": 544622828544.0, "grad_norm": 0.06643858588339867, "language_loss": 0.88938701, "learning_rate": 0.0005606013874129543, "loss": 0.90021759, "num_input_tokens_seen": 206670192, "router_z_loss_mlp": 0.12011719, "routerloss_mlp": 0.0, "step": 2482, "time_per_iteration": 2.7547929286956787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081649, "balance_loss_mlp": 1.07017505, "diversity_loss_mlp": 0.0, "epoch": 0.4776837245094267, "flos": 540079031808.0, "grad_norm": 0.06416127972697647, "language_loss": 0.80410159, "learning_rate": 0.0005602921301211768, "loss": 0.81491804, "num_input_tokens_seen": 206746992, "router_z_loss_mlp": 0.11474609, "routerloss_mlp": 0.0, "step": 2483, "time_per_iteration": 2.7025153636932373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080053, "balance_loss_mlp": 1.06850159, "diversity_loss_mlp": 0.0, "epoch": 0.4778761061946903, "flos": 471785513472.0, "grad_norm": 0.07652865967226291, "language_loss": 0.8209163, "learning_rate": 0.0005599828494219185, "loss": 0.83171678, "num_input_tokens_seen": 206813584, "router_z_loss_mlp": 0.11541748, "routerloss_mlp": 0.0, "step": 2484, "time_per_iteration": 2.5415024757385254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070825, "balance_loss_mlp": 1.05903542, "diversity_loss_mlp": 0.0, "epoch": 0.4780684878799538, "flos": 726082527744.0, "grad_norm": 0.07721505579443601, "language_loss": 0.89162952, "learning_rate": 0.0005596735454352527, "loss": 0.90233779, "num_input_tokens_seen": 206885840, "router_z_loss_mlp": 0.11785889, "routerloss_mlp": 0.0, "step": 2485, "time_per_iteration": 2.8591346740722656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077742, "balance_loss_mlp": 1.06591046, "diversity_loss_mlp": 0.0, "epoch": 0.4782608695652174, "flos": 548922147840.0, "grad_norm": 0.07819028279068943, "language_loss": 0.85696715, "learning_rate": 0.0005593642182812619, "loss": 0.86774457, "num_input_tokens_seen": 206955104, "router_z_loss_mlp": 0.1182251, "routerloss_mlp": 0.0, "step": 2486, "time_per_iteration": 2.679927349090576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077454, "balance_loss_mlp": 1.06575358, "diversity_loss_mlp": 0.0, "epoch": 0.47845325125048094, "flos": 829923604992.0, "grad_norm": 0.0859238614993436, "language_loss": 0.83753216, "learning_rate": 0.0005590548680800378, "loss": 0.84830678, "num_input_tokens_seen": 207039792, "router_z_loss_mlp": 0.11694336, "routerloss_mlp": 0.0, "step": 2487, "time_per_iteration": 3.0984909534454346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071222, "balance_loss_mlp": 1.05950415, "diversity_loss_mlp": 0.0, "epoch": 0.4786456329357445, "flos": 514164920832.0, "grad_norm": 0.06795851613398404, "language_loss": 0.76434267, "learning_rate": 0.0005587454949516804, "loss": 0.77505481, "num_input_tokens_seen": 207115632, "router_z_loss_mlp": 0.11712646, "routerloss_mlp": 0.0, "step": 2488, "time_per_iteration": 2.692324161529541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107671, "balance_loss_mlp": 1.06507468, "diversity_loss_mlp": 0.0, "epoch": 0.47883801462100806, "flos": 564658781184.0, "grad_norm": 0.06921637005003253, "language_loss": 0.8785038, "learning_rate": 0.0005584360990162993, "loss": 0.88927084, "num_input_tokens_seen": 207184336, "router_z_loss_mlp": 0.11627197, "routerloss_mlp": 0.0, "step": 2489, "time_per_iteration": 2.646521806716919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077817, "balance_loss_mlp": 1.06614649, "diversity_loss_mlp": 0.0, "epoch": 0.47903039630627164, "flos": 579577545216.0, "grad_norm": 0.06386300972416134, "language_loss": 0.85713631, "learning_rate": 0.0005581266803940124, "loss": 0.86791456, "num_input_tokens_seen": 207258720, "router_z_loss_mlp": 0.11657715, "routerloss_mlp": 0.0, "step": 2490, "time_per_iteration": 2.735152244567871 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070809, "balance_loss_mlp": 1.05925143, "diversity_loss_mlp": 0.0, "epoch": 0.47922277799153523, "flos": 618950149632.0, "grad_norm": 0.0718717211843218, "language_loss": 0.87536263, "learning_rate": 0.0005578172392049471, "loss": 0.88607073, "num_input_tokens_seen": 207329216, "router_z_loss_mlp": 0.11553955, "routerloss_mlp": 0.0, "step": 2491, "time_per_iteration": 2.7718377113342285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00892921, "balance_loss_mlp": 1.54530287, "diversity_loss_mlp": 0.21191472, "epoch": 0.47941515967679876, "flos": 639653096448.0, "grad_norm": 0.033555176901221506, "language_loss": 0.84551859, "learning_rate": 0.0005575077755692386, "loss": 0.85444778, "num_input_tokens_seen": 207403712, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01431197, "step": 2492, "time_per_iteration": 2.81888747215271 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070402, "balance_loss_mlp": 1.05893993, "diversity_loss_mlp": 0.0, "epoch": 0.47960754136206235, "flos": 519823194624.0, "grad_norm": 0.054684262853474656, "language_loss": 0.86001486, "learning_rate": 0.0005571982896070316, "loss": 0.8707189, "num_input_tokens_seen": 207477120, "router_z_loss_mlp": 0.11456299, "routerloss_mlp": 0.0, "step": 2493, "time_per_iteration": 2.655311346054077 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084039, "balance_loss_mlp": 1.07248712, "diversity_loss_mlp": 0.0, "epoch": 0.4797999230473259, "flos": 475044507648.0, "grad_norm": 0.07545203546694841, "language_loss": 0.89854079, "learning_rate": 0.0005568887814384792, "loss": 0.90938115, "num_input_tokens_seen": 207544592, "router_z_loss_mlp": 0.11547852, "routerloss_mlp": 0.0, "step": 2494, "time_per_iteration": 2.5930681228637695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082502, "balance_loss_mlp": 1.07098675, "diversity_loss_mlp": 0.0, "epoch": 0.47999230473258947, "flos": 532026620928.0, "grad_norm": 0.07194257940045806, "language_loss": 0.87281573, "learning_rate": 0.000556579251183743, "loss": 0.88364077, "num_input_tokens_seen": 207613808, "router_z_loss_mlp": 0.11517334, "routerloss_mlp": 0.0, "step": 2495, "time_per_iteration": 2.6386003494262695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076942, "balance_loss_mlp": 1.06520605, "diversity_loss_mlp": 0.0, "epoch": 0.480184686417853, "flos": 601486373376.0, "grad_norm": 0.0750590648958695, "language_loss": 0.80158448, "learning_rate": 0.0005562696989629936, "loss": 0.81235385, "num_input_tokens_seen": 207684464, "router_z_loss_mlp": 0.11737061, "routerloss_mlp": 0.0, "step": 2496, "time_per_iteration": 2.7050864696502686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00880705, "balance_loss_mlp": 1.52288473, "diversity_loss_mlp": 0.21003026, "epoch": 0.4803770681031166, "flos": 528196806144.0, "grad_norm": 0.02916103721032611, "language_loss": 0.82606125, "learning_rate": 0.0005559601248964095, "loss": 0.83486831, "num_input_tokens_seen": 207754016, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01424794, "step": 2497, "time_per_iteration": 2.6473939418792725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085332, "balance_loss_mlp": 1.0741564, "diversity_loss_mlp": 0.0, "epoch": 0.4805694497883801, "flos": 511192622592.0, "grad_norm": 0.07410871061403823, "language_loss": 0.85882998, "learning_rate": 0.0005556505291041783, "loss": 0.86968333, "num_input_tokens_seen": 207827104, "router_z_loss_mlp": 0.11175537, "routerloss_mlp": 0.0, "step": 2498, "time_per_iteration": 2.665832042694092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105898, "balance_loss_mlp": 1.09428692, "diversity_loss_mlp": 0.0, "epoch": 0.4807618314736437, "flos": 600342160896.0, "grad_norm": 0.06465509842390993, "language_loss": 0.84413946, "learning_rate": 0.0005553409117064954, "loss": 0.8551985, "num_input_tokens_seen": 207907824, "router_z_loss_mlp": 0.11608887, "routerloss_mlp": 0.0, "step": 2499, "time_per_iteration": 2.880300521850586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00859857, "balance_loss_mlp": 1.48415303, "diversity_loss_mlp": 0.20870377, "epoch": 0.4809542131589073, "flos": 568965441024.0, "grad_norm": 0.02869897963967695, "language_loss": 0.84937358, "learning_rate": 0.0005550312728235654, "loss": 0.85797209, "num_input_tokens_seen": 207975632, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01342856, "step": 2500, "time_per_iteration": 2.7199203968048096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109419, "balance_loss_mlp": 1.08251953, "diversity_loss_mlp": 0.0, "epoch": 0.4811465948441708, "flos": 575994779136.0, "grad_norm": 0.07331859457791397, "language_loss": 0.83879191, "learning_rate": 0.0005547216125756003, "loss": 0.84973377, "num_input_tokens_seen": 208048000, "router_z_loss_mlp": 0.11651611, "routerloss_mlp": 0.0, "step": 2501, "time_per_iteration": 2.732786178588867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098928, "balance_loss_mlp": 1.08708501, "diversity_loss_mlp": 0.0, "epoch": 0.4813389765294344, "flos": 823865209344.0, "grad_norm": 0.07387575947985975, "language_loss": 0.82064617, "learning_rate": 0.0005544119310828211, "loss": 0.83163536, "num_input_tokens_seen": 208132592, "router_z_loss_mlp": 0.11846924, "routerloss_mlp": 0.0, "step": 2502, "time_per_iteration": 3.1029446125030518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100673, "balance_loss_mlp": 1.08865714, "diversity_loss_mlp": 0.0, "epoch": 0.48153135821469795, "flos": 635531816448.0, "grad_norm": 0.06596898477591598, "language_loss": 0.84657413, "learning_rate": 0.0005541022284654568, "loss": 0.8575809, "num_input_tokens_seen": 208215824, "router_z_loss_mlp": 0.12017822, "routerloss_mlp": 0.0, "step": 2503, "time_per_iteration": 2.901026725769043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092163, "balance_loss_mlp": 1.08015907, "diversity_loss_mlp": 0.0, "epoch": 0.48172373989996153, "flos": 503701120512.0, "grad_norm": 0.0759157238743441, "language_loss": 0.83907866, "learning_rate": 0.0005537925048437446, "loss": 0.85000032, "num_input_tokens_seen": 208284304, "router_z_loss_mlp": 0.11987305, "routerloss_mlp": 0.0, "step": 2504, "time_per_iteration": 2.6014060974121094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00594545, "balance_loss_mlp": 1.03097272, "diversity_loss_mlp": 0.13453583, "epoch": 0.48191612158522507, "flos": 1532362074624.0, "grad_norm": 0.0017952613590721677, "language_loss": 0.75751472, "learning_rate": 0.00055348276033793, "loss": 0.76346016, "num_input_tokens_seen": 208510224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01179097, "step": 2505, "time_per_iteration": 4.960138320922852 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00867388, "balance_loss_mlp": 1.49711311, "diversity_loss_mlp": 0.20998067, "epoch": 0.48210850327048865, "flos": 702424161792.0, "grad_norm": 0.029195885141922995, "language_loss": 0.88189656, "learning_rate": 0.0005531729950682664, "loss": 0.8905704, "num_input_tokens_seen": 208596816, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01384138, "step": 2506, "time_per_iteration": 3.056671142578125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082436, "balance_loss_mlp": 1.07027662, "diversity_loss_mlp": 0.0, "epoch": 0.4823008849557522, "flos": 439778502144.0, "grad_norm": 0.09591114443507165, "language_loss": 0.84746361, "learning_rate": 0.000552863209155015, "loss": 0.85828793, "num_input_tokens_seen": 208659616, "router_z_loss_mlp": 0.12158203, "routerloss_mlp": 0.0, "step": 2507, "time_per_iteration": 2.473930835723877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00866012, "balance_loss_mlp": 1.49284506, "diversity_loss_mlp": 0.21081753, "epoch": 0.48249326664101577, "flos": 471859665408.0, "grad_norm": 0.03047035716712285, "language_loss": 0.82048851, "learning_rate": 0.0005525534027184461, "loss": 0.82914865, "num_input_tokens_seen": 208728080, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01418037, "step": 2508, "time_per_iteration": 2.5708260536193848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078028, "balance_loss_mlp": 1.06624985, "diversity_loss_mlp": 0.0, "epoch": 0.48268564832627936, "flos": 563225674752.0, "grad_norm": 0.06261213728600334, "language_loss": 0.83131289, "learning_rate": 0.0005522435758788365, "loss": 0.84209323, "num_input_tokens_seen": 208803376, "router_z_loss_mlp": 0.11761475, "routerloss_mlp": 0.0, "step": 2509, "time_per_iteration": 2.7291650772094727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00853572, "balance_loss_mlp": 1.46908307, "diversity_loss_mlp": 0.20966808, "epoch": 0.4828780300115429, "flos": 629606670336.0, "grad_norm": 0.03495470447814039, "language_loss": 0.80126894, "learning_rate": 0.0005519337287564721, "loss": 0.80980462, "num_input_tokens_seen": 208876656, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01419635, "step": 2510, "time_per_iteration": 2.843698024749756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077046, "balance_loss_mlp": 1.06536365, "diversity_loss_mlp": 0.0, "epoch": 0.4830704116968065, "flos": 631850305536.0, "grad_norm": 0.07525780944119016, "language_loss": 0.83495927, "learning_rate": 0.000551623861471646, "loss": 0.84572971, "num_input_tokens_seen": 208950224, "router_z_loss_mlp": 0.11669922, "routerloss_mlp": 0.0, "step": 2511, "time_per_iteration": 2.7327091693878174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01133891, "balance_loss_mlp": 1.1273582, "diversity_loss_mlp": 0.0, "epoch": 0.48326279338207, "flos": 1569268588032.0, "grad_norm": 0.052890092991212126, "language_loss": 0.78818834, "learning_rate": 0.0005513139741446594, "loss": 0.79952717, "num_input_tokens_seen": 209173984, "router_z_loss_mlp": 0.06542969, "routerloss_mlp": 0.0, "step": 2512, "time_per_iteration": 4.820046901702881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073764, "balance_loss_mlp": 1.06182551, "diversity_loss_mlp": 0.0, "epoch": 0.4834551750673336, "flos": 509238254592.0, "grad_norm": 0.09417698665840035, "language_loss": 0.8670119, "learning_rate": 0.0005510040668958211, "loss": 0.87774956, "num_input_tokens_seen": 209242832, "router_z_loss_mlp": 0.1192627, "routerloss_mlp": 0.0, "step": 2513, "time_per_iteration": 2.579780101776123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051826, "balance_loss_mlp": 1.04515004, "diversity_loss_mlp": 0.0, "epoch": 0.48364755675259713, "flos": 1528663311360.0, "grad_norm": 0.02705432320804172, "language_loss": 0.77760583, "learning_rate": 0.0005506941398454483, "loss": 0.78812408, "num_input_tokens_seen": 209473520, "router_z_loss_mlp": 0.06689453, "routerloss_mlp": 0.0, "step": 2514, "time_per_iteration": 4.83507227897644 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106953, "balance_loss_mlp": 1.05716157, "diversity_loss_mlp": 0.0, "epoch": 0.4838399384378607, "flos": 564989893632.0, "grad_norm": 0.07432123735470587, "language_loss": 0.83170015, "learning_rate": 0.0005503841931138645, "loss": 0.84239542, "num_input_tokens_seen": 209544208, "router_z_loss_mlp": 0.12365723, "routerloss_mlp": 0.0, "step": 2515, "time_per_iteration": 2.6834895610809326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071737, "balance_loss_mlp": 1.05963731, "diversity_loss_mlp": 0.0, "epoch": 0.4840323201231243, "flos": 387691121664.0, "grad_norm": 0.07510504832931036, "language_loss": 0.81515384, "learning_rate": 0.0005500742268214025, "loss": 0.82587123, "num_input_tokens_seen": 209607408, "router_z_loss_mlp": 0.12091064, "routerloss_mlp": 0.0, "step": 2516, "time_per_iteration": 2.494479179382324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084077, "balance_loss_mlp": 1.0715425, "diversity_loss_mlp": 0.0, "epoch": 0.48422470180838784, "flos": 630995360256.0, "grad_norm": 0.06432693662792612, "language_loss": 0.85142744, "learning_rate": 0.0005497642410884014, "loss": 0.86226821, "num_input_tokens_seen": 209683392, "router_z_loss_mlp": 0.12542725, "routerloss_mlp": 0.0, "step": 2517, "time_per_iteration": 2.760425090789795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080501, "balance_loss_mlp": 1.06788325, "diversity_loss_mlp": 0.0, "epoch": 0.4844170834936514, "flos": 499226333184.0, "grad_norm": 0.06763953923030977, "language_loss": 0.85120749, "learning_rate": 0.0005494542360352085, "loss": 0.86201251, "num_input_tokens_seen": 209753184, "router_z_loss_mlp": 0.12628174, "routerloss_mlp": 0.0, "step": 2518, "time_per_iteration": 2.6524109840393066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108191, "balance_loss_mlp": 1.06955993, "diversity_loss_mlp": 0.0, "epoch": 0.48460946517891496, "flos": 551076576768.0, "grad_norm": 0.06089591080825084, "language_loss": 0.85741639, "learning_rate": 0.0005491442117821783, "loss": 0.86823547, "num_input_tokens_seen": 209829568, "router_z_loss_mlp": 0.12353516, "routerloss_mlp": 0.0, "step": 2519, "time_per_iteration": 2.7461459636688232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079216, "balance_loss_mlp": 1.06654429, "diversity_loss_mlp": 0.0, "epoch": 0.48480184686417854, "flos": 529390204416.0, "grad_norm": 0.07584750574127574, "language_loss": 0.87494171, "learning_rate": 0.0005488341684496732, "loss": 0.88573384, "num_input_tokens_seen": 209902176, "router_z_loss_mlp": 0.12677002, "routerloss_mlp": 0.0, "step": 2520, "time_per_iteration": 2.6621458530426025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080655, "balance_loss_mlp": 1.06843615, "diversity_loss_mlp": 0.0, "epoch": 0.4849942285494421, "flos": 531912821760.0, "grad_norm": 0.06605179609441998, "language_loss": 0.9207437, "learning_rate": 0.0005485241061580624, "loss": 0.9315502, "num_input_tokens_seen": 209969168, "router_z_loss_mlp": 0.12213135, "routerloss_mlp": 0.0, "step": 2521, "time_per_iteration": 2.772949457168579 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089898, "balance_loss_mlp": 1.07741094, "diversity_loss_mlp": 0.0, "epoch": 0.48518661023470566, "flos": 722578682880.0, "grad_norm": 0.06556104217544546, "language_loss": 0.8458938, "learning_rate": 0.0005482140250277228, "loss": 0.85679281, "num_input_tokens_seen": 210049616, "router_z_loss_mlp": 0.12481689, "routerloss_mlp": 0.0, "step": 2522, "time_per_iteration": 2.978330135345459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00847105, "balance_loss_mlp": 1.45509815, "diversity_loss_mlp": 0.21114388, "epoch": 0.4853789919199692, "flos": 506105169408.0, "grad_norm": 0.03368619412239962, "language_loss": 0.87090278, "learning_rate": 0.0005479039251790387, "loss": 0.87937379, "num_input_tokens_seen": 210118512, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01398425, "step": 2523, "time_per_iteration": 2.6939120292663574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00840008, "balance_loss_mlp": 1.44148707, "diversity_loss_mlp": 0.21069397, "epoch": 0.4855713736052328, "flos": 660487094784.0, "grad_norm": 0.03188648694570784, "language_loss": 0.84722733, "learning_rate": 0.0005475938067324014, "loss": 0.85562754, "num_input_tokens_seen": 210193728, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0139178, "step": 2524, "time_per_iteration": 2.859184980392456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106923, "balance_loss_mlp": 1.09528267, "diversity_loss_mlp": 0.0, "epoch": 0.48576375529049637, "flos": 436959277056.0, "grad_norm": 0.06962736532334403, "language_loss": 0.83518255, "learning_rate": 0.0005472836698082098, "loss": 0.84625173, "num_input_tokens_seen": 210258832, "router_z_loss_mlp": 0.11633301, "routerloss_mlp": 0.0, "step": 2525, "time_per_iteration": 2.534783363342285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101033, "balance_loss_mlp": 1.08923149, "diversity_loss_mlp": 0.0, "epoch": 0.4859561369757599, "flos": 581707381248.0, "grad_norm": 0.07423434170097615, "language_loss": 0.84140873, "learning_rate": 0.0005469735145268694, "loss": 0.85241902, "num_input_tokens_seen": 210335280, "router_z_loss_mlp": 0.11798096, "routerloss_mlp": 0.0, "step": 2526, "time_per_iteration": 2.7064108848571777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090982, "balance_loss_mlp": 1.07928169, "diversity_loss_mlp": 0.0, "epoch": 0.4861485186610235, "flos": 487964487168.0, "grad_norm": 0.0731540325655248, "language_loss": 0.81093931, "learning_rate": 0.0005466633410087933, "loss": 0.82184911, "num_input_tokens_seen": 210407072, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2527, "time_per_iteration": 2.682969570159912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085513, "balance_loss_mlp": 1.07793164, "diversity_loss_mlp": 0.0, "epoch": 0.486340900346287, "flos": 1557734727168.0, "grad_norm": 0.03711409557498352, "language_loss": 0.77260822, "learning_rate": 0.0005463531493744017, "loss": 0.78346336, "num_input_tokens_seen": 210644544, "router_z_loss_mlp": 0.07568359, "routerloss_mlp": 0.0, "step": 2528, "time_per_iteration": 4.962444067001343 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085086, "balance_loss_mlp": 1.07360601, "diversity_loss_mlp": 0.0, "epoch": 0.4865332820315506, "flos": 483005514240.0, "grad_norm": 0.07791605184695856, "language_loss": 0.88148236, "learning_rate": 0.0005460429397441214, "loss": 0.89233321, "num_input_tokens_seen": 210711760, "router_z_loss_mlp": 0.11468506, "routerloss_mlp": 0.0, "step": 2529, "time_per_iteration": 2.5908102989196777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00835644, "balance_loss_mlp": 1.43002903, "diversity_loss_mlp": 0.21195745, "epoch": 0.48672566371681414, "flos": 535809447936.0, "grad_norm": 0.03186279831907627, "language_loss": 0.87013817, "learning_rate": 0.0005457327122383866, "loss": 0.87849462, "num_input_tokens_seen": 210783040, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01465126, "step": 2530, "time_per_iteration": 2.656264543533325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01036926, "balance_loss_mlp": 1.02939153, "diversity_loss_mlp": 0.0, "epoch": 0.4869180454020777, "flos": 1412665422336.0, "grad_norm": 0.02373673385224348, "language_loss": 0.74636483, "learning_rate": 0.0005454224669776385, "loss": 0.75673413, "num_input_tokens_seen": 211002128, "router_z_loss_mlp": 0.07519531, "routerloss_mlp": 0.0, "step": 2531, "time_per_iteration": 4.838496208190918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100935, "balance_loss_mlp": 1.08965194, "diversity_loss_mlp": 0.0, "epoch": 0.48711042708734126, "flos": 573113885184.0, "grad_norm": 0.06845758574896237, "language_loss": 0.75823385, "learning_rate": 0.0005451122040823244, "loss": 0.76924324, "num_input_tokens_seen": 211080080, "router_z_loss_mlp": 0.11279297, "routerloss_mlp": 0.0, "step": 2532, "time_per_iteration": 2.770751714706421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099102, "balance_loss_mlp": 1.08746696, "diversity_loss_mlp": 0.0, "epoch": 0.48730280877260485, "flos": 626547737088.0, "grad_norm": 0.07387169787784394, "language_loss": 0.77164292, "learning_rate": 0.0005448019236728997, "loss": 0.7826339, "num_input_tokens_seen": 211162944, "router_z_loss_mlp": 0.11621094, "routerloss_mlp": 0.0, "step": 2533, "time_per_iteration": 2.8874497413635254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00837303, "balance_loss_mlp": 1.43305767, "diversity_loss_mlp": 0.21233971, "epoch": 0.48749519045786843, "flos": 512479996416.0, "grad_norm": 0.03246629845535473, "language_loss": 0.8471576, "learning_rate": 0.0005444916258698255, "loss": 0.85553062, "num_input_tokens_seen": 211230448, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01460437, "step": 2534, "time_per_iteration": 2.623748540878296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112009, "balance_loss_mlp": 1.10867584, "diversity_loss_mlp": 0.0, "epoch": 0.48768757214313196, "flos": 525414657024.0, "grad_norm": 0.06488105381348498, "language_loss": 0.86077154, "learning_rate": 0.0005441813107935704, "loss": 0.87197244, "num_input_tokens_seen": 211301248, "router_z_loss_mlp": 0.11407471, "routerloss_mlp": 0.0, "step": 2535, "time_per_iteration": 2.6705739498138428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124443, "balance_loss_mlp": 1.11277819, "diversity_loss_mlp": 0.0, "epoch": 0.48787995382839555, "flos": 505032910848.0, "grad_norm": 0.07112550287999594, "language_loss": 0.86025345, "learning_rate": 0.0005438709785646091, "loss": 0.87149793, "num_input_tokens_seen": 211369888, "router_z_loss_mlp": 0.11651611, "routerloss_mlp": 0.0, "step": 2536, "time_per_iteration": 2.5624749660491943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120427, "balance_loss_mlp": 1.10864902, "diversity_loss_mlp": 0.0, "epoch": 0.4880723355136591, "flos": 575172140544.0, "grad_norm": 0.08492074314505418, "language_loss": 0.86885595, "learning_rate": 0.0005435606293034234, "loss": 0.8800602, "num_input_tokens_seen": 211441808, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2537, "time_per_iteration": 2.6347479820251465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121847, "balance_loss_mlp": 1.11035514, "diversity_loss_mlp": 0.0, "epoch": 0.48826471719892267, "flos": 561444203520.0, "grad_norm": 0.08214525409599778, "language_loss": 0.84619427, "learning_rate": 0.0005432502631305016, "loss": 0.8574127, "num_input_tokens_seen": 211511216, "router_z_loss_mlp": 0.11499023, "routerloss_mlp": 0.0, "step": 2538, "time_per_iteration": 2.700613021850586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113422, "balance_loss_mlp": 1.10190618, "diversity_loss_mlp": 0.0, "epoch": 0.4884570988841862, "flos": 726188613120.0, "grad_norm": 0.06429037959601741, "language_loss": 0.83193302, "learning_rate": 0.0005429398801663386, "loss": 0.84306723, "num_input_tokens_seen": 211589264, "router_z_loss_mlp": 0.1151123, "routerloss_mlp": 0.0, "step": 2539, "time_per_iteration": 2.9839913845062256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097658, "balance_loss_mlp": 1.08599913, "diversity_loss_mlp": 0.0, "epoch": 0.4886494805694498, "flos": 431019449856.0, "grad_norm": 0.12053819121868696, "language_loss": 0.8290484, "learning_rate": 0.0005426294805314355, "loss": 0.84002495, "num_input_tokens_seen": 211652928, "router_z_loss_mlp": 0.11651611, "routerloss_mlp": 0.0, "step": 2540, "time_per_iteration": 2.5029373168945312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094803, "balance_loss_mlp": 1.08291781, "diversity_loss_mlp": 0.0, "epoch": 0.4888418622547134, "flos": 673006579200.0, "grad_norm": 0.06245664696917761, "language_loss": 0.80155998, "learning_rate": 0.0005423190643463003, "loss": 0.81250799, "num_input_tokens_seen": 211741664, "router_z_loss_mlp": 0.11883545, "routerloss_mlp": 0.0, "step": 2541, "time_per_iteration": 2.949772357940674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093208, "balance_loss_mlp": 1.08163261, "diversity_loss_mlp": 0.0, "epoch": 0.4890342439399769, "flos": 541897579008.0, "grad_norm": 0.07791209549750817, "language_loss": 0.8281579, "learning_rate": 0.0005420086317314473, "loss": 0.83908999, "num_input_tokens_seen": 211809136, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2542, "time_per_iteration": 2.6383941173553467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088553, "balance_loss_mlp": 1.0765729, "diversity_loss_mlp": 0.0, "epoch": 0.4892266256252405, "flos": 590676406272.0, "grad_norm": 0.06362759827284906, "language_loss": 0.81081557, "learning_rate": 0.0005416981828073971, "loss": 0.82170111, "num_input_tokens_seen": 211883136, "router_z_loss_mlp": 0.11981201, "routerloss_mlp": 0.0, "step": 2543, "time_per_iteration": 2.8023576736450195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01007156, "balance_loss_mlp": 0.99990815, "diversity_loss_mlp": 0.0, "epoch": 0.48941900731050403, "flos": 1516296526848.0, "grad_norm": 0.01938913368632236, "language_loss": 0.77115011, "learning_rate": 0.0005413877176946765, "loss": 0.78122175, "num_input_tokens_seen": 212117488, "router_z_loss_mlp": 0.07226562, "routerloss_mlp": 0.0, "step": 2544, "time_per_iteration": 4.817458629608154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093446, "balance_loss_mlp": 1.08184147, "diversity_loss_mlp": 0.0, "epoch": 0.4896113889957676, "flos": 470564951040.0, "grad_norm": 0.08678858450341921, "language_loss": 0.84937072, "learning_rate": 0.000541077236513819, "loss": 0.86030519, "num_input_tokens_seen": 212181952, "router_z_loss_mlp": 0.11590576, "routerloss_mlp": 0.0, "step": 2545, "time_per_iteration": 2.5271120071411133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089352, "balance_loss_mlp": 1.07800293, "diversity_loss_mlp": 0.0, "epoch": 0.48980377068103115, "flos": 496557983232.0, "grad_norm": 0.07207098978073255, "language_loss": 0.82449925, "learning_rate": 0.0005407667393853638, "loss": 0.83539271, "num_input_tokens_seen": 212252608, "router_z_loss_mlp": 0.11346436, "routerloss_mlp": 0.0, "step": 2546, "time_per_iteration": 2.6385204792022705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093617, "balance_loss_mlp": 1.08250618, "diversity_loss_mlp": 0.0, "epoch": 0.48999615236629473, "flos": 692852382720.0, "grad_norm": 0.06843607218978102, "language_loss": 0.83673334, "learning_rate": 0.0005404562264298569, "loss": 0.84766948, "num_input_tokens_seen": 212328560, "router_z_loss_mlp": 0.11108398, "routerloss_mlp": 0.0, "step": 2547, "time_per_iteration": 2.845250368118286 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102851, "balance_loss_mlp": 1.09120405, "diversity_loss_mlp": 0.0, "epoch": 0.49018853405155827, "flos": 541694946816.0, "grad_norm": 0.06940893068641271, "language_loss": 0.83999467, "learning_rate": 0.0005401456977678498, "loss": 0.8510232, "num_input_tokens_seen": 212399616, "router_z_loss_mlp": 0.11639404, "routerloss_mlp": 0.0, "step": 2548, "time_per_iteration": 2.638720750808716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099322, "balance_loss_mlp": 1.08754444, "diversity_loss_mlp": 0.0, "epoch": 0.49038091573682185, "flos": 695663894016.0, "grad_norm": 0.08453175850654031, "language_loss": 0.77431965, "learning_rate": 0.0005398351535199008, "loss": 0.78531289, "num_input_tokens_seen": 212482352, "router_z_loss_mlp": 0.11773682, "routerloss_mlp": 0.0, "step": 2549, "time_per_iteration": 3.064035415649414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103016, "balance_loss_mlp": 1.09175706, "diversity_loss_mlp": 0.0, "epoch": 0.49057329742208544, "flos": 596902929408.0, "grad_norm": 0.07238427843662706, "language_loss": 0.84189212, "learning_rate": 0.0005395245938065735, "loss": 0.85292226, "num_input_tokens_seen": 212559504, "router_z_loss_mlp": 0.11254883, "routerloss_mlp": 0.0, "step": 2550, "time_per_iteration": 2.7746829986572266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01118468, "balance_loss_mlp": 1.10702372, "diversity_loss_mlp": 0.0, "epoch": 0.490765679107349, "flos": 513406522368.0, "grad_norm": 0.08583684211433391, "language_loss": 0.82631576, "learning_rate": 0.0005392140187484379, "loss": 0.83750039, "num_input_tokens_seen": 212625664, "router_z_loss_mlp": 0.11431885, "routerloss_mlp": 0.0, "step": 2551, "time_per_iteration": 2.582195281982422 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01124142, "balance_loss_mlp": 1.11273384, "diversity_loss_mlp": 0.0, "epoch": 0.49095806079261256, "flos": 629606670336.0, "grad_norm": 0.0682243054902728, "language_loss": 0.89719319, "learning_rate": 0.0005389034284660701, "loss": 0.90843463, "num_input_tokens_seen": 212702000, "router_z_loss_mlp": 0.11401367, "routerloss_mlp": 0.0, "step": 2552, "time_per_iteration": 2.824427366256714 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131566, "balance_loss_mlp": 1.12022352, "diversity_loss_mlp": 0.0, "epoch": 0.4911504424778761, "flos": 915307941888.0, "grad_norm": 0.08386347311462448, "language_loss": 0.82537109, "learning_rate": 0.000538592823080052, "loss": 0.83668673, "num_input_tokens_seen": 212785376, "router_z_loss_mlp": 0.11340332, "routerloss_mlp": 0.0, "step": 2553, "time_per_iteration": 3.24122953414917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127167, "balance_loss_mlp": 1.11565781, "diversity_loss_mlp": 0.0, "epoch": 0.4913428241631397, "flos": 438943380480.0, "grad_norm": 0.06967590045443849, "language_loss": 0.84592807, "learning_rate": 0.000538282202710971, "loss": 0.85719973, "num_input_tokens_seen": 212848176, "router_z_loss_mlp": 0.11505127, "routerloss_mlp": 0.0, "step": 2554, "time_per_iteration": 2.5753910541534424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01130476, "balance_loss_mlp": 1.11918652, "diversity_loss_mlp": 0.0, "epoch": 0.4915352058484032, "flos": 636092725248.0, "grad_norm": 0.07442252581599826, "language_loss": 0.82315147, "learning_rate": 0.000537971567479421, "loss": 0.83445626, "num_input_tokens_seen": 212917888, "router_z_loss_mlp": 0.11279297, "routerloss_mlp": 0.0, "step": 2555, "time_per_iteration": 2.7354228496551514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01127557, "balance_loss_mlp": 1.11596429, "diversity_loss_mlp": 0.0, "epoch": 0.4917275875336668, "flos": 504518989824.0, "grad_norm": 0.09076326784032986, "language_loss": 0.88129175, "learning_rate": 0.0005376609175060011, "loss": 0.8925674, "num_input_tokens_seen": 212986288, "router_z_loss_mlp": 0.11584473, "routerloss_mlp": 0.0, "step": 2556, "time_per_iteration": 2.6124610900878906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106232, "balance_loss_mlp": 1.09465659, "diversity_loss_mlp": 0.0, "epoch": 0.49191996921893033, "flos": 654547267584.0, "grad_norm": 0.07210041581715526, "language_loss": 0.80779845, "learning_rate": 0.0005373502529113162, "loss": 0.81886077, "num_input_tokens_seen": 213059504, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2557, "time_per_iteration": 2.823993444442749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100884, "balance_loss_mlp": 1.08888519, "diversity_loss_mlp": 0.0, "epoch": 0.4921123509041939, "flos": 492359980032.0, "grad_norm": 0.07460313059090624, "language_loss": 0.81449521, "learning_rate": 0.0005370395738159773, "loss": 0.82550406, "num_input_tokens_seen": 213129984, "router_z_loss_mlp": 0.11987305, "routerloss_mlp": 0.0, "step": 2558, "time_per_iteration": 2.6436777114868164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00834873, "balance_loss_mlp": 1.42800272, "diversity_loss_mlp": 0.21467975, "epoch": 0.4923047325894575, "flos": 546167162880.0, "grad_norm": 0.03347414568603151, "language_loss": 0.82822633, "learning_rate": 0.0005367288803406003, "loss": 0.83657515, "num_input_tokens_seen": 213199184, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01353174, "step": 2559, "time_per_iteration": 2.662224531173706 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083349, "balance_loss_mlp": 1.07132101, "diversity_loss_mlp": 0.0, "epoch": 0.49249711427472104, "flos": 596473072128.0, "grad_norm": 0.0788259825299616, "language_loss": 0.818443, "learning_rate": 0.0005364181726058073, "loss": 0.82927656, "num_input_tokens_seen": 213272480, "router_z_loss_mlp": 0.12023926, "routerloss_mlp": 0.0, "step": 2560, "time_per_iteration": 2.686300277709961 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076289, "balance_loss_mlp": 1.06417727, "diversity_loss_mlp": 0.0, "epoch": 0.4926894959599846, "flos": 497825533440.0, "grad_norm": 0.07955060847799823, "language_loss": 0.8272332, "learning_rate": 0.0005361074507322261, "loss": 0.83799613, "num_input_tokens_seen": 213338704, "router_z_loss_mlp": 0.12103271, "routerloss_mlp": 0.0, "step": 2561, "time_per_iteration": 2.5809431076049805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073469, "balance_loss_mlp": 1.06138754, "diversity_loss_mlp": 0.0, "epoch": 0.49288187764524816, "flos": 536130648576.0, "grad_norm": 0.07091460094801966, "language_loss": 0.81425411, "learning_rate": 0.000535796714840489, "loss": 0.82498884, "num_input_tokens_seen": 213406016, "router_z_loss_mlp": 0.12072754, "routerloss_mlp": 0.0, "step": 2562, "time_per_iteration": 2.6425187587738037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073356, "balance_loss_mlp": 1.06107163, "diversity_loss_mlp": 0.0, "epoch": 0.49307425933051174, "flos": 641555707392.0, "grad_norm": 0.10871355986071002, "language_loss": 0.83800626, "learning_rate": 0.0005354859650512348, "loss": 0.84873986, "num_input_tokens_seen": 213474016, "router_z_loss_mlp": 0.12280273, "routerloss_mlp": 0.0, "step": 2563, "time_per_iteration": 2.7957375049591064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074544, "balance_loss_mlp": 1.06282604, "diversity_loss_mlp": 0.0, "epoch": 0.4932666410157753, "flos": 516252911616.0, "grad_norm": 0.0798917687203661, "language_loss": 0.87428886, "learning_rate": 0.0005351752014851074, "loss": 0.88503432, "num_input_tokens_seen": 213539696, "router_z_loss_mlp": 0.11712646, "routerloss_mlp": 0.0, "step": 2564, "time_per_iteration": 2.6205673217773438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085324, "balance_loss_mlp": 1.07352281, "diversity_loss_mlp": 0.0, "epoch": 0.49345902270103886, "flos": 601503625728.0, "grad_norm": 0.06874397476353511, "language_loss": 0.83621442, "learning_rate": 0.0005348644242627553, "loss": 0.84706771, "num_input_tokens_seen": 213609504, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2565, "time_per_iteration": 2.7460625171661377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010105, "balance_loss_mlp": 1.00411022, "diversity_loss_mlp": 0.0, "epoch": 0.49365140438630245, "flos": 1493673716736.0, "grad_norm": 0.013767653611631516, "language_loss": 0.75286627, "learning_rate": 0.0005345536335048336, "loss": 0.76297128, "num_input_tokens_seen": 213846064, "router_z_loss_mlp": 0.06396484, "routerloss_mlp": 0.0, "step": 2566, "time_per_iteration": 4.943475723266602 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110567, "balance_loss_mlp": 1.09899187, "diversity_loss_mlp": 0.0, "epoch": 0.493843786071566, "flos": 629599329792.0, "grad_norm": 0.08759046492811678, "language_loss": 0.81650245, "learning_rate": 0.0005342428293320013, "loss": 0.82760805, "num_input_tokens_seen": 213923216, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2567, "time_per_iteration": 2.7889564037323 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102659, "balance_loss_mlp": 1.09142327, "diversity_loss_mlp": 0.0, "epoch": 0.49403616775682957, "flos": 617564030976.0, "grad_norm": 0.07999691418133484, "language_loss": 0.8344667, "learning_rate": 0.0005339320118649238, "loss": 0.84549326, "num_input_tokens_seen": 213994096, "router_z_loss_mlp": 0.11230469, "routerloss_mlp": 0.0, "step": 2568, "time_per_iteration": 2.7774229049682617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108192, "balance_loss_mlp": 1.09715271, "diversity_loss_mlp": 0.0, "epoch": 0.4942285494420931, "flos": 577647770112.0, "grad_norm": 0.07608170940546952, "language_loss": 0.86422324, "learning_rate": 0.000533621181224271, "loss": 0.87530512, "num_input_tokens_seen": 214069104, "router_z_loss_mlp": 0.1104126, "routerloss_mlp": 0.0, "step": 2569, "time_per_iteration": 2.7708005905151367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095635, "balance_loss_mlp": 1.08442283, "diversity_loss_mlp": 0.0, "epoch": 0.4944209311273567, "flos": 630211995648.0, "grad_norm": 0.06858054906862693, "language_loss": 0.8138749, "learning_rate": 0.0005333103375307182, "loss": 0.82483125, "num_input_tokens_seen": 214150368, "router_z_loss_mlp": 0.11218262, "routerloss_mlp": 0.0, "step": 2570, "time_per_iteration": 2.8407034873962402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090688, "balance_loss_mlp": 1.07972121, "diversity_loss_mlp": 0.0, "epoch": 0.4946133128126202, "flos": 587612703744.0, "grad_norm": 0.06174009778797697, "language_loss": 0.85711801, "learning_rate": 0.0005329994809049451, "loss": 0.86802495, "num_input_tokens_seen": 214220112, "router_z_loss_mlp": 0.10974121, "routerloss_mlp": 0.0, "step": 2571, "time_per_iteration": 2.7500712871551514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096363, "balance_loss_mlp": 1.08508563, "diversity_loss_mlp": 0.0, "epoch": 0.4948056944978838, "flos": 583718648832.0, "grad_norm": 0.06855083904022342, "language_loss": 0.88066995, "learning_rate": 0.0005326886114676375, "loss": 0.89163363, "num_input_tokens_seen": 214294480, "router_z_loss_mlp": 0.11279297, "routerloss_mlp": 0.0, "step": 2572, "time_per_iteration": 2.730137825012207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083866, "balance_loss_mlp": 1.07269001, "diversity_loss_mlp": 0.0, "epoch": 0.49499807618314734, "flos": 481822027776.0, "grad_norm": 0.06053914015656951, "language_loss": 0.88364595, "learning_rate": 0.0005323777293394854, "loss": 0.89448464, "num_input_tokens_seen": 214359568, "router_z_loss_mlp": 0.11181641, "routerloss_mlp": 0.0, "step": 2573, "time_per_iteration": 2.539825201034546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084718, "balance_loss_mlp": 1.07365584, "diversity_loss_mlp": 0.0, "epoch": 0.4951904578684109, "flos": 518978161152.0, "grad_norm": 0.06797932871808014, "language_loss": 0.81904709, "learning_rate": 0.000532066834641184, "loss": 0.8298943, "num_input_tokens_seen": 214432032, "router_z_loss_mlp": 0.11065674, "routerloss_mlp": 0.0, "step": 2574, "time_per_iteration": 2.6663713455200195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103824, "balance_loss_mlp": 1.09271336, "diversity_loss_mlp": 0.0, "epoch": 0.4953828395536745, "flos": 535505499648.0, "grad_norm": 0.07191084425213706, "language_loss": 0.85331243, "learning_rate": 0.0005317559274934334, "loss": 0.86435068, "num_input_tokens_seen": 214504096, "router_z_loss_mlp": 0.11114502, "routerloss_mlp": 0.0, "step": 2575, "time_per_iteration": 2.756410598754883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097116, "balance_loss_mlp": 1.08592236, "diversity_loss_mlp": 0.0, "epoch": 0.49557522123893805, "flos": 528564994560.0, "grad_norm": 0.08893709148941176, "language_loss": 0.80365205, "learning_rate": 0.0005314450080169382, "loss": 0.81462318, "num_input_tokens_seen": 214575920, "router_z_loss_mlp": 0.11199951, "routerloss_mlp": 0.0, "step": 2576, "time_per_iteration": 2.613163471221924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092174, "balance_loss_mlp": 1.0810523, "diversity_loss_mlp": 0.0, "epoch": 0.49576760292420163, "flos": 428007504384.0, "grad_norm": 0.10818754121519983, "language_loss": 0.8082127, "learning_rate": 0.0005311340763324083, "loss": 0.81913447, "num_input_tokens_seen": 214641664, "router_z_loss_mlp": 0.11126709, "routerloss_mlp": 0.0, "step": 2577, "time_per_iteration": 2.5670807361602783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087439, "balance_loss_mlp": 1.07612574, "diversity_loss_mlp": 0.0, "epoch": 0.49595998460946517, "flos": 565236942336.0, "grad_norm": 0.07097138632102568, "language_loss": 0.82323599, "learning_rate": 0.0005308231325605578, "loss": 0.83411032, "num_input_tokens_seen": 214711744, "router_z_loss_mlp": 0.11315918, "routerloss_mlp": 0.0, "step": 2578, "time_per_iteration": 2.6519079208374023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085757, "balance_loss_mlp": 1.07421172, "diversity_loss_mlp": 0.0, "epoch": 0.49615236629472875, "flos": 702490973184.0, "grad_norm": 0.06601832089031445, "language_loss": 0.76727217, "learning_rate": 0.0005305121768221061, "loss": 0.7781297, "num_input_tokens_seen": 214802256, "router_z_loss_mlp": 0.11535645, "routerloss_mlp": 0.0, "step": 2579, "time_per_iteration": 3.1306209564208984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01040876, "balance_loss_mlp": 1.03489161, "diversity_loss_mlp": 0.0, "epoch": 0.4963447479799923, "flos": 1441665630720.0, "grad_norm": 0.022004289450105873, "language_loss": 0.75038326, "learning_rate": 0.000530201209237777, "loss": 0.76079202, "num_input_tokens_seen": 215023648, "router_z_loss_mlp": 0.05981445, "routerloss_mlp": 0.0, "step": 2580, "time_per_iteration": 4.8141255378723145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079045, "balance_loss_mlp": 1.06767821, "diversity_loss_mlp": 0.0, "epoch": 0.49653712966525587, "flos": 537627995136.0, "grad_norm": 0.06618835036619775, "language_loss": 0.91614985, "learning_rate": 0.0005298902299282984, "loss": 0.92694032, "num_input_tokens_seen": 215094080, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2581, "time_per_iteration": 2.586012125015259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087579, "balance_loss_mlp": 1.07617044, "diversity_loss_mlp": 0.0, "epoch": 0.4967295113505194, "flos": 607280467968.0, "grad_norm": 0.07143589820149647, "language_loss": 0.84265745, "learning_rate": 0.0005295792390144033, "loss": 0.85353327, "num_input_tokens_seen": 215165456, "router_z_loss_mlp": 0.11407471, "routerloss_mlp": 0.0, "step": 2582, "time_per_iteration": 2.704911708831787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096311, "balance_loss_mlp": 1.08442605, "diversity_loss_mlp": 0.0, "epoch": 0.496921893035783, "flos": 474577574400.0, "grad_norm": 0.07556433689349051, "language_loss": 0.83576399, "learning_rate": 0.0005292682366168294, "loss": 0.84672707, "num_input_tokens_seen": 215229344, "router_z_loss_mlp": 0.11883545, "routerloss_mlp": 0.0, "step": 2583, "time_per_iteration": 2.5530638694763184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105009, "balance_loss_mlp": 1.09309435, "diversity_loss_mlp": 0.0, "epoch": 0.4971142747210466, "flos": 597463838208.0, "grad_norm": 0.06699014279274042, "language_loss": 0.80089158, "learning_rate": 0.0005289572228563181, "loss": 0.81194162, "num_input_tokens_seen": 215305616, "router_z_loss_mlp": 0.11914062, "routerloss_mlp": 0.0, "step": 2584, "time_per_iteration": 2.729093551635742 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100707, "balance_loss_mlp": 1.08861935, "diversity_loss_mlp": 0.0, "epoch": 0.4973066564063101, "flos": 599603586048.0, "grad_norm": 0.0657007833960997, "language_loss": 0.83234823, "learning_rate": 0.000528646197853616, "loss": 0.8433553, "num_input_tokens_seen": 215378128, "router_z_loss_mlp": 0.12078857, "routerloss_mlp": 0.0, "step": 2585, "time_per_iteration": 2.727252721786499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113697, "balance_loss_mlp": 1.10166335, "diversity_loss_mlp": 0.0, "epoch": 0.4974990380915737, "flos": 649474495488.0, "grad_norm": 0.07376563164337009, "language_loss": 0.85810697, "learning_rate": 0.0005283351617294735, "loss": 0.86924398, "num_input_tokens_seen": 215453536, "router_z_loss_mlp": 0.12023926, "routerloss_mlp": 0.0, "step": 2586, "time_per_iteration": 2.945610761642456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01011716, "balance_loss_mlp": 1.00470638, "diversity_loss_mlp": 0.0, "epoch": 0.49769141977683723, "flos": 1529278548480.0, "grad_norm": 0.017193207514109847, "language_loss": 0.7663666, "learning_rate": 0.0005280241146046456, "loss": 0.77648377, "num_input_tokens_seen": 215689440, "router_z_loss_mlp": 0.0703125, "routerloss_mlp": 0.0, "step": 2587, "time_per_iteration": 5.038366079330444 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108852, "balance_loss_mlp": 1.07597303, "diversity_loss_mlp": 0.0, "epoch": 0.4978838014621008, "flos": 536370356736.0, "grad_norm": 0.06591325697086226, "language_loss": 0.86769819, "learning_rate": 0.0005277130565998916, "loss": 0.87858337, "num_input_tokens_seen": 215759600, "router_z_loss_mlp": 0.12554932, "routerloss_mlp": 0.0, "step": 2588, "time_per_iteration": 2.7726681232452393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086804, "balance_loss_mlp": 1.07443595, "diversity_loss_mlp": 0.0, "epoch": 0.49807618314736435, "flos": 539616867840.0, "grad_norm": 0.05822748641904789, "language_loss": 0.81899714, "learning_rate": 0.0005274019878359748, "loss": 0.82986516, "num_input_tokens_seen": 215833920, "router_z_loss_mlp": 0.12371826, "routerloss_mlp": 0.0, "step": 2589, "time_per_iteration": 2.733985424041748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075103, "balance_loss_mlp": 1.06275249, "diversity_loss_mlp": 0.0, "epoch": 0.49826856483262794, "flos": 542475740160.0, "grad_norm": 0.0736619230298454, "language_loss": 0.87174684, "learning_rate": 0.0005270909084336628, "loss": 0.88249791, "num_input_tokens_seen": 215903616, "router_z_loss_mlp": 0.12335205, "routerloss_mlp": 0.0, "step": 2590, "time_per_iteration": 2.648728370666504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075145, "balance_loss_mlp": 1.06231809, "diversity_loss_mlp": 0.0, "epoch": 0.4984609465178915, "flos": 522321219072.0, "grad_norm": 0.07329601175103365, "language_loss": 0.8877548, "learning_rate": 0.0005267798185137276, "loss": 0.89850616, "num_input_tokens_seen": 215974832, "router_z_loss_mlp": 0.12835693, "routerloss_mlp": 0.0, "step": 2591, "time_per_iteration": 2.616903066635132 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061242, "balance_loss_mlp": 1.04852843, "diversity_loss_mlp": 0.0, "epoch": 0.49865332820315506, "flos": 574544420352.0, "grad_norm": 0.0712913700859702, "language_loss": 0.89140213, "learning_rate": 0.0005264687181969444, "loss": 0.90201461, "num_input_tokens_seen": 216045024, "router_z_loss_mlp": 0.12713623, "routerloss_mlp": 0.0, "step": 2592, "time_per_iteration": 2.7121951580047607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067017, "balance_loss_mlp": 1.05430353, "diversity_loss_mlp": 0.0, "epoch": 0.49884570988841864, "flos": 1013607115776.0, "grad_norm": 0.07969645648170227, "language_loss": 0.75208342, "learning_rate": 0.0005261576076040937, "loss": 0.76275361, "num_input_tokens_seen": 216129024, "router_z_loss_mlp": 0.12719727, "routerloss_mlp": 0.0, "step": 2593, "time_per_iteration": 3.248811721801758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059604, "balance_loss_mlp": 1.04746807, "diversity_loss_mlp": 0.0, "epoch": 0.4990380915736822, "flos": 559581239808.0, "grad_norm": 0.07355463018535204, "language_loss": 0.84396625, "learning_rate": 0.0005258464868559591, "loss": 0.85456228, "num_input_tokens_seen": 216197648, "router_z_loss_mlp": 0.12121582, "routerloss_mlp": 0.0, "step": 2594, "time_per_iteration": 2.6535778045654297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058063, "balance_loss_mlp": 1.0461601, "diversity_loss_mlp": 0.0, "epoch": 0.49923047325894576, "flos": 498954691584.0, "grad_norm": 0.06735340586139127, "language_loss": 0.88490266, "learning_rate": 0.0005255353560733284, "loss": 0.89548326, "num_input_tokens_seen": 216263904, "router_z_loss_mlp": 0.11907959, "routerloss_mlp": 0.0, "step": 2595, "time_per_iteration": 2.5711045265197754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01040496, "balance_loss_mlp": 1.03453541, "diversity_loss_mlp": 0.0, "epoch": 0.4994228549442093, "flos": 1496636476416.0, "grad_norm": 0.025598241729826776, "language_loss": 0.75578642, "learning_rate": 0.0005252242153769931, "loss": 0.76619136, "num_input_tokens_seen": 216493152, "router_z_loss_mlp": 0.05957031, "routerloss_mlp": 0.0, "step": 2596, "time_per_iteration": 4.7992448806762695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106698, "balance_loss_mlp": 1.05498767, "diversity_loss_mlp": 0.0, "epoch": 0.4996152366294729, "flos": 557374680576.0, "grad_norm": 0.07107233717475309, "language_loss": 0.83179224, "learning_rate": 0.0005249130648877492, "loss": 0.84246206, "num_input_tokens_seen": 216567216, "router_z_loss_mlp": 0.11987305, "routerloss_mlp": 0.0, "step": 2597, "time_per_iteration": 2.7089900970458984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068426, "balance_loss_mlp": 1.05646324, "diversity_loss_mlp": 0.0, "epoch": 0.4998076183147364, "flos": 415594105344.0, "grad_norm": 0.08792128719199578, "language_loss": 0.84945238, "learning_rate": 0.0005246019047263953, "loss": 0.86013663, "num_input_tokens_seen": 216630624, "router_z_loss_mlp": 0.11962891, "routerloss_mlp": 0.0, "step": 2598, "time_per_iteration": 2.4586942195892334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070932, "balance_loss_mlp": 1.0594883, "diversity_loss_mlp": 0.0, "epoch": 0.5, "flos": 467350373376.0, "grad_norm": 0.08031275074858332, "language_loss": 0.82562858, "learning_rate": 0.0005242907350137353, "loss": 0.83633792, "num_input_tokens_seen": 216696576, "router_z_loss_mlp": 0.11431885, "routerloss_mlp": 0.0, "step": 2599, "time_per_iteration": 2.547146797180176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075807, "balance_loss_mlp": 1.06445217, "diversity_loss_mlp": 0.0, "epoch": 0.5001923816852636, "flos": 482718818304.0, "grad_norm": 0.08690624784708721, "language_loss": 0.79332286, "learning_rate": 0.0005239795558705754, "loss": 0.80408096, "num_input_tokens_seen": 216767584, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2600, "time_per_iteration": 2.5985541343688965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077027, "balance_loss_mlp": 1.06555915, "diversity_loss_mlp": 0.0, "epoch": 0.5003847633705272, "flos": 533798180352.0, "grad_norm": 0.06025548364908716, "language_loss": 0.89517641, "learning_rate": 0.0005236683674177264, "loss": 0.90594667, "num_input_tokens_seen": 216834320, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2601, "time_per_iteration": 2.6358349323272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090245, "balance_loss_mlp": 1.07874131, "diversity_loss_mlp": 0.0, "epoch": 0.5005771450557907, "flos": 737789285376.0, "grad_norm": 0.06252214062087984, "language_loss": 0.82497251, "learning_rate": 0.0005233571697760021, "loss": 0.83587497, "num_input_tokens_seen": 216907312, "router_z_loss_mlp": 0.11505127, "routerloss_mlp": 0.0, "step": 2602, "time_per_iteration": 2.8629817962646484 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01112229, "balance_loss_mlp": 1.10087442, "diversity_loss_mlp": 0.0, "epoch": 0.5007695267410542, "flos": 778977865728.0, "grad_norm": 0.06974132169475507, "language_loss": 0.8293485, "learning_rate": 0.0005230459630662203, "loss": 0.84047079, "num_input_tokens_seen": 216979872, "router_z_loss_mlp": 0.11352539, "routerloss_mlp": 0.0, "step": 2603, "time_per_iteration": 2.939380168914795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01114631, "balance_loss_mlp": 1.10359812, "diversity_loss_mlp": 0.0, "epoch": 0.5009619084263178, "flos": 623476694016.0, "grad_norm": 0.10511771954620508, "language_loss": 0.81605637, "learning_rate": 0.0005227347474092022, "loss": 0.82720268, "num_input_tokens_seen": 217054000, "router_z_loss_mlp": 0.11035156, "routerloss_mlp": 0.0, "step": 2604, "time_per_iteration": 2.7169747352600098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0112322, "balance_loss_mlp": 1.11197877, "diversity_loss_mlp": 0.0, "epoch": 0.5011542901115814, "flos": 531087611904.0, "grad_norm": 0.07495893748856379, "language_loss": 0.83243322, "learning_rate": 0.0005224235229257724, "loss": 0.84366548, "num_input_tokens_seen": 217126784, "router_z_loss_mlp": 0.11236572, "routerloss_mlp": 0.0, "step": 2605, "time_per_iteration": 2.6940438747406006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113092, "balance_loss_mlp": 1.10178471, "diversity_loss_mlp": 0.0, "epoch": 0.5013466717968449, "flos": 527534581248.0, "grad_norm": 0.06884013858989874, "language_loss": 0.86851203, "learning_rate": 0.0005221122897367589, "loss": 0.87964296, "num_input_tokens_seen": 217203056, "router_z_loss_mlp": 0.11309814, "routerloss_mlp": 0.0, "step": 2606, "time_per_iteration": 2.800685405731201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109504, "balance_loss_mlp": 1.09854841, "diversity_loss_mlp": 0.0, "epoch": 0.5015390534821085, "flos": 566017735680.0, "grad_norm": 0.08142217271827161, "language_loss": 0.81335354, "learning_rate": 0.0005218010479629932, "loss": 0.82444859, "num_input_tokens_seen": 217273280, "router_z_loss_mlp": 0.10961914, "routerloss_mlp": 0.0, "step": 2607, "time_per_iteration": 2.657087564468384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098904, "balance_loss_mlp": 1.08753133, "diversity_loss_mlp": 0.0, "epoch": 0.5017314351673721, "flos": 566697212928.0, "grad_norm": 0.08269023882009051, "language_loss": 0.82140303, "learning_rate": 0.0005214897977253102, "loss": 0.83239204, "num_input_tokens_seen": 217345568, "router_z_loss_mlp": 0.11364746, "routerloss_mlp": 0.0, "step": 2608, "time_per_iteration": 2.649846076965332 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084998, "balance_loss_mlp": 1.07372093, "diversity_loss_mlp": 0.0, "epoch": 0.5019238168526357, "flos": 522291483648.0, "grad_norm": 0.061165709745894754, "language_loss": 0.84233439, "learning_rate": 0.0005211785391445473, "loss": 0.8531844, "num_input_tokens_seen": 217422848, "router_z_loss_mlp": 0.11279297, "routerloss_mlp": 0.0, "step": 2609, "time_per_iteration": 2.7179222106933594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087043, "balance_loss_mlp": 1.07538986, "diversity_loss_mlp": 0.0, "epoch": 0.5021161985378992, "flos": 641434567680.0, "grad_norm": 0.06641391212047838, "language_loss": 0.79080439, "learning_rate": 0.0005208672723415467, "loss": 0.80167478, "num_input_tokens_seen": 217502896, "router_z_loss_mlp": 0.11639404, "routerloss_mlp": 0.0, "step": 2610, "time_per_iteration": 2.7928884029388428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085289, "balance_loss_mlp": 1.07359457, "diversity_loss_mlp": 0.0, "epoch": 0.5023085802231627, "flos": 591284302848.0, "grad_norm": 0.07063839016412009, "language_loss": 0.79436052, "learning_rate": 0.0005205559974371525, "loss": 0.80521345, "num_input_tokens_seen": 217575072, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2611, "time_per_iteration": 2.75744366645813 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085649, "balance_loss_mlp": 1.07412767, "diversity_loss_mlp": 0.0, "epoch": 0.5025009619084263, "flos": 472373586432.0, "grad_norm": 0.06307258943078059, "language_loss": 0.82345438, "learning_rate": 0.0005202447145522123, "loss": 0.83431089, "num_input_tokens_seen": 217644976, "router_z_loss_mlp": 0.1151123, "routerloss_mlp": 0.0, "step": 2612, "time_per_iteration": 2.6847879886627197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084149, "balance_loss_mlp": 1.07245421, "diversity_loss_mlp": 0.0, "epoch": 0.5026933435936899, "flos": 455139606528.0, "grad_norm": 0.060686478103186246, "language_loss": 0.79358983, "learning_rate": 0.0005199334238075769, "loss": 0.80443138, "num_input_tokens_seen": 217712816, "router_z_loss_mlp": 0.11682129, "routerloss_mlp": 0.0, "step": 2613, "time_per_iteration": 2.560041666030884 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084812, "balance_loss_mlp": 1.07277226, "diversity_loss_mlp": 0.0, "epoch": 0.5028857252789535, "flos": 491747314176.0, "grad_norm": 0.086387426867178, "language_loss": 0.91963339, "learning_rate": 0.0005196221253241, "loss": 0.93048155, "num_input_tokens_seen": 217780256, "router_z_loss_mlp": 0.12030029, "routerloss_mlp": 0.0, "step": 2614, "time_per_iteration": 2.6397578716278076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107839, "balance_loss_mlp": 1.06617713, "diversity_loss_mlp": 0.0, "epoch": 0.503078106964217, "flos": 625569454080.0, "grad_norm": 0.09198716130289855, "language_loss": 0.82890773, "learning_rate": 0.0005193108192226383, "loss": 0.83969164, "num_input_tokens_seen": 217848496, "router_z_loss_mlp": 0.12213135, "routerloss_mlp": 0.0, "step": 2615, "time_per_iteration": 2.7370193004608154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076176, "balance_loss_mlp": 1.06396329, "diversity_loss_mlp": 0.0, "epoch": 0.5032704886494805, "flos": 579046371840.0, "grad_norm": 0.08941342921082604, "language_loss": 0.86907744, "learning_rate": 0.000518999505624052, "loss": 0.87983918, "num_input_tokens_seen": 217919216, "router_z_loss_mlp": 0.12213135, "routerloss_mlp": 0.0, "step": 2616, "time_per_iteration": 2.733515739440918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067104, "balance_loss_mlp": 1.05521274, "diversity_loss_mlp": 0.0, "epoch": 0.5034628703347441, "flos": 471753206784.0, "grad_norm": 0.05504525356098391, "language_loss": 0.83447164, "learning_rate": 0.000518688184649203, "loss": 0.84514272, "num_input_tokens_seen": 217996096, "router_z_loss_mlp": 0.11883545, "routerloss_mlp": 0.0, "step": 2617, "time_per_iteration": 2.816542625427246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075166, "balance_loss_mlp": 1.06264269, "diversity_loss_mlp": 0.0, "epoch": 0.5036552520200077, "flos": 489837362688.0, "grad_norm": 0.07489503160460931, "language_loss": 0.83596766, "learning_rate": 0.0005183768564189577, "loss": 0.84671938, "num_input_tokens_seen": 218063072, "router_z_loss_mlp": 0.12524414, "routerloss_mlp": 0.0, "step": 2618, "time_per_iteration": 2.5781893730163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081949, "balance_loss_mlp": 1.07029045, "diversity_loss_mlp": 0.0, "epoch": 0.5038476337052713, "flos": 494235426816.0, "grad_norm": 0.0695581827230682, "language_loss": 0.81485611, "learning_rate": 0.0005180655210541838, "loss": 0.82567555, "num_input_tokens_seen": 218131056, "router_z_loss_mlp": 0.11651611, "routerloss_mlp": 0.0, "step": 2619, "time_per_iteration": 2.5642077922821045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091231, "balance_loss_mlp": 1.07894695, "diversity_loss_mlp": 0.0, "epoch": 0.5040400153905348, "flos": 600604263936.0, "grad_norm": 0.08072673001204132, "language_loss": 0.83226323, "learning_rate": 0.0005177541786757527, "loss": 0.84317553, "num_input_tokens_seen": 218203536, "router_z_loss_mlp": 0.1227417, "routerloss_mlp": 0.0, "step": 2620, "time_per_iteration": 2.7365450859069824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100722, "balance_loss_mlp": 1.0882231, "diversity_loss_mlp": 0.0, "epoch": 0.5042323970757984, "flos": 811525962240.0, "grad_norm": 0.0921594393427519, "language_loss": 0.82626402, "learning_rate": 0.000517442829404538, "loss": 0.83727121, "num_input_tokens_seen": 218283008, "router_z_loss_mlp": 0.12493896, "routerloss_mlp": 0.0, "step": 2621, "time_per_iteration": 3.053333044052124 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097629, "balance_loss_mlp": 1.08534431, "diversity_loss_mlp": 0.0, "epoch": 0.504424778761062, "flos": 627308706816.0, "grad_norm": 0.0844592365120011, "language_loss": 0.87026393, "learning_rate": 0.0005171314733614166, "loss": 0.88124025, "num_input_tokens_seen": 218362096, "router_z_loss_mlp": 0.12286377, "routerloss_mlp": 0.0, "step": 2622, "time_per_iteration": 2.8867554664611816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099715, "balance_loss_mlp": 1.08721614, "diversity_loss_mlp": 0.0, "epoch": 0.5046171604463255, "flos": 515911887360.0, "grad_norm": 0.07191738026805333, "language_loss": 0.78457403, "learning_rate": 0.0005168201106672671, "loss": 0.79557121, "num_input_tokens_seen": 218439440, "router_z_loss_mlp": 0.125, "routerloss_mlp": 0.0, "step": 2623, "time_per_iteration": 2.7532849311828613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083535, "balance_loss_mlp": 1.07122076, "diversity_loss_mlp": 0.0, "epoch": 0.504809542131589, "flos": 527831188992.0, "grad_norm": 0.06664161086213699, "language_loss": 0.84876573, "learning_rate": 0.0005165087414429717, "loss": 0.85960108, "num_input_tokens_seen": 218505936, "router_z_loss_mlp": 0.12316895, "routerloss_mlp": 0.0, "step": 2624, "time_per_iteration": 2.614475965499878 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073648, "balance_loss_mlp": 1.061566, "diversity_loss_mlp": 0.0, "epoch": 0.5050019238168526, "flos": 554118257664.0, "grad_norm": 0.06712294156504883, "language_loss": 0.83509946, "learning_rate": 0.0005161973658094144, "loss": 0.84583604, "num_input_tokens_seen": 218573824, "router_z_loss_mlp": 0.12072754, "routerloss_mlp": 0.0, "step": 2625, "time_per_iteration": 2.6536033153533936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00875819, "balance_loss_mlp": 1.51064336, "diversity_loss_mlp": 0.21324398, "epoch": 0.5051943055021162, "flos": 574774216704.0, "grad_norm": 0.02954045761884847, "language_loss": 0.82599998, "learning_rate": 0.000515885983887482, "loss": 0.83475816, "num_input_tokens_seen": 218648016, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01387555, "step": 2626, "time_per_iteration": 2.801612138748169 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070563, "balance_loss_mlp": 1.05863595, "diversity_loss_mlp": 0.0, "epoch": 0.5053866871873798, "flos": 496686463488.0, "grad_norm": 0.07357396162877478, "language_loss": 0.84283531, "learning_rate": 0.0005155745957980636, "loss": 0.8535409, "num_input_tokens_seen": 218714128, "router_z_loss_mlp": 0.11920166, "routerloss_mlp": 0.0, "step": 2627, "time_per_iteration": 2.6239585876464844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071483, "balance_loss_mlp": 1.0589962, "diversity_loss_mlp": 0.0, "epoch": 0.5055790688726434, "flos": 502213685760.0, "grad_norm": 0.06901961430938243, "language_loss": 0.88532668, "learning_rate": 0.000515263201662051, "loss": 0.89604151, "num_input_tokens_seen": 218784800, "router_z_loss_mlp": 0.12493896, "routerloss_mlp": 0.0, "step": 2628, "time_per_iteration": 2.65803861618042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107038, "balance_loss_mlp": 1.05840504, "diversity_loss_mlp": 0.0, "epoch": 0.5057714505579068, "flos": 845227809792.0, "grad_norm": 0.06314416177701848, "language_loss": 0.8250618, "learning_rate": 0.0005149518016003378, "loss": 0.8357656, "num_input_tokens_seen": 218868256, "router_z_loss_mlp": 0.11968994, "routerloss_mlp": 0.0, "step": 2629, "time_per_iteration": 3.1646623611450195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061865, "balance_loss_mlp": 1.04946709, "diversity_loss_mlp": 0.0, "epoch": 0.5059638322431704, "flos": 497825533440.0, "grad_norm": 0.1007750022567515, "language_loss": 0.82337832, "learning_rate": 0.0005146403957338206, "loss": 0.83399695, "num_input_tokens_seen": 218932496, "router_z_loss_mlp": 0.12402344, "routerloss_mlp": 0.0, "step": 2630, "time_per_iteration": 2.5879476070404053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064427, "balance_loss_mlp": 1.05236936, "diversity_loss_mlp": 0.0, "epoch": 0.506156213928434, "flos": 617843013120.0, "grad_norm": 0.06667308072604639, "language_loss": 0.82288837, "learning_rate": 0.0005143289841833975, "loss": 0.83353263, "num_input_tokens_seen": 219010672, "router_z_loss_mlp": 0.12060547, "routerloss_mlp": 0.0, "step": 2631, "time_per_iteration": 2.8448615074157715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068448, "balance_loss_mlp": 1.05643749, "diversity_loss_mlp": 0.0, "epoch": 0.5063485956136976, "flos": 424857166848.0, "grad_norm": 0.09203997555384738, "language_loss": 0.82179189, "learning_rate": 0.0005140175670699696, "loss": 0.83247638, "num_input_tokens_seen": 219077104, "router_z_loss_mlp": 0.11999512, "routerloss_mlp": 0.0, "step": 2632, "time_per_iteration": 2.642666816711426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067258, "balance_loss_mlp": 1.05545044, "diversity_loss_mlp": 0.0, "epoch": 0.5065409772989612, "flos": 569926471680.0, "grad_norm": 0.04894531982576629, "language_loss": 0.82796603, "learning_rate": 0.0005137061445144395, "loss": 0.8386386, "num_input_tokens_seen": 219164880, "router_z_loss_mlp": 0.11804199, "routerloss_mlp": 0.0, "step": 2633, "time_per_iteration": 2.8800737857818604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076133, "balance_loss_mlp": 1.06476033, "diversity_loss_mlp": 0.0, "epoch": 0.5067333589842247, "flos": 628801284096.0, "grad_norm": 0.06583044180155191, "language_loss": 0.87074906, "learning_rate": 0.000513394716637712, "loss": 0.88151038, "num_input_tokens_seen": 219237376, "router_z_loss_mlp": 0.1137085, "routerloss_mlp": 0.0, "step": 2634, "time_per_iteration": 2.7507505416870117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01035986, "balance_loss_mlp": 1.02921486, "diversity_loss_mlp": 0.0, "epoch": 0.5069257406694883, "flos": 1447867187712.0, "grad_norm": 0.03533282921310782, "language_loss": 0.79191709, "learning_rate": 0.0005130832835606946, "loss": 0.80227697, "num_input_tokens_seen": 219467632, "router_z_loss_mlp": 0.06787109, "routerloss_mlp": 0.0, "step": 2635, "time_per_iteration": 4.825605869293213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110028, "balance_loss_mlp": 1.08881176, "diversity_loss_mlp": 0.0, "epoch": 0.5071181223547518, "flos": 638835227136.0, "grad_norm": 0.07735545811428028, "language_loss": 0.81068468, "learning_rate": 0.0005127718454042958, "loss": 0.82168746, "num_input_tokens_seen": 219545392, "router_z_loss_mlp": 0.11462402, "routerloss_mlp": 0.0, "step": 2636, "time_per_iteration": 2.8241050243377686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099835, "balance_loss_mlp": 1.08840299, "diversity_loss_mlp": 0.0, "epoch": 0.5073105040400154, "flos": 713565241344.0, "grad_norm": 0.08187506034762644, "language_loss": 0.83836603, "learning_rate": 0.0005124604022894269, "loss": 0.8493644, "num_input_tokens_seen": 219623104, "router_z_loss_mlp": 0.11425781, "routerloss_mlp": 0.0, "step": 2637, "time_per_iteration": 2.9366774559020996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01019034, "balance_loss_mlp": 1.01259708, "diversity_loss_mlp": 0.0, "epoch": 0.5075028857252789, "flos": 1436447126016.0, "grad_norm": 0.025963071476552062, "language_loss": 0.77188224, "learning_rate": 0.000512148954337001, "loss": 0.7820726, "num_input_tokens_seen": 219853328, "router_z_loss_mlp": 0.06445312, "routerloss_mlp": 0.0, "step": 2638, "time_per_iteration": 4.828620433807373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092892, "balance_loss_mlp": 1.08166814, "diversity_loss_mlp": 0.0, "epoch": 0.5076952674105425, "flos": 571147034112.0, "grad_norm": 0.07837351333742608, "language_loss": 0.83244252, "learning_rate": 0.0005118375016679325, "loss": 0.84337139, "num_input_tokens_seen": 219925024, "router_z_loss_mlp": 0.11224365, "routerloss_mlp": 0.0, "step": 2639, "time_per_iteration": 2.801852226257324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077953, "balance_loss_mlp": 1.0666697, "diversity_loss_mlp": 0.0, "epoch": 0.5078876490958061, "flos": 516712504320.0, "grad_norm": 0.07879033409242599, "language_loss": 0.80358827, "learning_rate": 0.0005115260444031382, "loss": 0.81436777, "num_input_tokens_seen": 219992752, "router_z_loss_mlp": 0.11279297, "routerloss_mlp": 0.0, "step": 2640, "time_per_iteration": 2.596771240234375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010253, "balance_loss_mlp": 1.00422084, "diversity_loss_mlp": 0.0, "epoch": 0.5080800307810697, "flos": 1584224428032.0, "grad_norm": 0.011737851482073082, "language_loss": 0.78731823, "learning_rate": 0.000511214582663537, "loss": 0.79742074, "num_input_tokens_seen": 220224160, "router_z_loss_mlp": 0.06030273, "routerloss_mlp": 0.0, "step": 2641, "time_per_iteration": 4.948842287063599 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075403, "balance_loss_mlp": 1.06412029, "diversity_loss_mlp": 0.0, "epoch": 0.5082724124663333, "flos": 485209502208.0, "grad_norm": 0.08031663653823312, "language_loss": 0.8740893, "learning_rate": 0.0005109031165700483, "loss": 0.88484335, "num_input_tokens_seen": 220289504, "router_z_loss_mlp": 0.112854, "routerloss_mlp": 0.0, "step": 2642, "time_per_iteration": 2.5833895206451416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060876, "balance_loss_mlp": 1.04938459, "diversity_loss_mlp": 0.0, "epoch": 0.5084647941515967, "flos": 682230366720.0, "grad_norm": 0.06372027514248847, "language_loss": 0.83170295, "learning_rate": 0.0005105916462435945, "loss": 0.84231174, "num_input_tokens_seen": 220361376, "router_z_loss_mlp": 0.1149292, "routerloss_mlp": 0.0, "step": 2643, "time_per_iteration": 2.841296911239624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106144, "balance_loss_mlp": 1.05014455, "diversity_loss_mlp": 0.0, "epoch": 0.5086571758368603, "flos": 548736768000.0, "grad_norm": 0.0681709540800111, "language_loss": 0.85266602, "learning_rate": 0.0005102801718050989, "loss": 0.86328042, "num_input_tokens_seen": 220434720, "router_z_loss_mlp": 0.11291504, "routerloss_mlp": 0.0, "step": 2644, "time_per_iteration": 2.680905818939209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058539, "balance_loss_mlp": 1.04714894, "diversity_loss_mlp": 0.0, "epoch": 0.5088495575221239, "flos": 564016379904.0, "grad_norm": 0.07434027721258654, "language_loss": 0.89314902, "learning_rate": 0.0005099686933754867, "loss": 0.90373439, "num_input_tokens_seen": 220506208, "router_z_loss_mlp": 0.1138916, "routerloss_mlp": 0.0, "step": 2645, "time_per_iteration": 2.723043441772461 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062253, "balance_loss_mlp": 1.05088663, "diversity_loss_mlp": 0.0, "epoch": 0.5090419392073875, "flos": 551407689216.0, "grad_norm": 0.07256046334666034, "language_loss": 0.8429243, "learning_rate": 0.0005096572110756845, "loss": 0.85354686, "num_input_tokens_seen": 220577456, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2646, "time_per_iteration": 2.6682143211364746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069615, "balance_loss_mlp": 1.05801558, "diversity_loss_mlp": 0.0, "epoch": 0.509234320892651, "flos": 567779383296.0, "grad_norm": 0.06200075514200526, "language_loss": 0.85445803, "learning_rate": 0.0005093457250266205, "loss": 0.86515421, "num_input_tokens_seen": 220649648, "router_z_loss_mlp": 0.11584473, "routerloss_mlp": 0.0, "step": 2647, "time_per_iteration": 2.682891368865967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069794, "balance_loss_mlp": 1.05816472, "diversity_loss_mlp": 0.0, "epoch": 0.5094267025779146, "flos": 582609314304.0, "grad_norm": 0.1092618136395953, "language_loss": 0.83279526, "learning_rate": 0.000509034235349224, "loss": 0.84349322, "num_input_tokens_seen": 220721168, "router_z_loss_mlp": 0.11627197, "routerloss_mlp": 0.0, "step": 2648, "time_per_iteration": 2.7173004150390625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068823, "balance_loss_mlp": 1.05756938, "diversity_loss_mlp": 0.0, "epoch": 0.5096190842631781, "flos": 591990944256.0, "grad_norm": 0.07759183255272654, "language_loss": 0.81290972, "learning_rate": 0.0005087227421644266, "loss": 0.82359791, "num_input_tokens_seen": 220796464, "router_z_loss_mlp": 0.11248779, "routerloss_mlp": 0.0, "step": 2649, "time_per_iteration": 2.79217791557312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066501, "balance_loss_mlp": 1.05469334, "diversity_loss_mlp": 0.0, "epoch": 0.5098114659484417, "flos": 513562166784.0, "grad_norm": 0.07036579944312285, "language_loss": 0.85978615, "learning_rate": 0.0005084112455931602, "loss": 0.87045121, "num_input_tokens_seen": 220862976, "router_z_loss_mlp": 0.11798096, "routerloss_mlp": 0.0, "step": 2650, "time_per_iteration": 2.593323230743408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107287, "balance_loss_mlp": 1.06125915, "diversity_loss_mlp": 0.0, "epoch": 0.5100038476337053, "flos": 484631341056.0, "grad_norm": 0.06673546987966349, "language_loss": 0.85377133, "learning_rate": 0.0005080997457563586, "loss": 0.86449993, "num_input_tokens_seen": 220926432, "router_z_loss_mlp": 0.11608887, "routerloss_mlp": 0.0, "step": 2651, "time_per_iteration": 2.5473101139068604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074592, "balance_loss_mlp": 1.06324303, "diversity_loss_mlp": 0.0, "epoch": 0.5101962293189688, "flos": 461603266560.0, "grad_norm": 0.07839929831674766, "language_loss": 0.79146206, "learning_rate": 0.0005077882427749569, "loss": 0.80220807, "num_input_tokens_seen": 220993008, "router_z_loss_mlp": 0.11340332, "routerloss_mlp": 0.0, "step": 2652, "time_per_iteration": 2.5378577709198 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081444, "balance_loss_mlp": 1.07002354, "diversity_loss_mlp": 0.0, "epoch": 0.5103886110042324, "flos": 587034542592.0, "grad_norm": 0.09222135648623411, "language_loss": 0.84599656, "learning_rate": 0.0005074767367698913, "loss": 0.85681099, "num_input_tokens_seen": 221059248, "router_z_loss_mlp": 0.11407471, "routerloss_mlp": 0.0, "step": 2653, "time_per_iteration": 2.7541823387145996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086598, "balance_loss_mlp": 1.0749042, "diversity_loss_mlp": 0.0, "epoch": 0.510580992689496, "flos": 845260116480.0, "grad_norm": 0.07250262260433718, "language_loss": 0.82987714, "learning_rate": 0.0005071652278620988, "loss": 0.84074312, "num_input_tokens_seen": 221133712, "router_z_loss_mlp": 0.11688232, "routerloss_mlp": 0.0, "step": 2654, "time_per_iteration": 3.0615251064300537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089781, "balance_loss_mlp": 1.07870018, "diversity_loss_mlp": 0.0, "epoch": 0.5107733743747596, "flos": 658624131072.0, "grad_norm": 0.07582936293709001, "language_loss": 0.83328903, "learning_rate": 0.0005068537161725186, "loss": 0.84418684, "num_input_tokens_seen": 221202192, "router_z_loss_mlp": 0.11083984, "routerloss_mlp": 0.0, "step": 2655, "time_per_iteration": 2.7840993404388428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092544, "balance_loss_mlp": 1.08139753, "diversity_loss_mlp": 0.0, "epoch": 0.510965756060023, "flos": 701732574720.0, "grad_norm": 0.07786356346883126, "language_loss": 0.84288549, "learning_rate": 0.0005065422018220893, "loss": 0.85381097, "num_input_tokens_seen": 221277104, "router_z_loss_mlp": 0.1114502, "routerloss_mlp": 0.0, "step": 2656, "time_per_iteration": 2.832575798034668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102102, "balance_loss_mlp": 1.09118247, "diversity_loss_mlp": 0.0, "epoch": 0.5111581377452866, "flos": 559731741696.0, "grad_norm": 0.08194812181942494, "language_loss": 0.80392313, "learning_rate": 0.0005062306849317521, "loss": 0.81494415, "num_input_tokens_seen": 221352320, "router_z_loss_mlp": 0.10931396, "routerloss_mlp": 0.0, "step": 2657, "time_per_iteration": 2.794966220855713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100168, "balance_loss_mlp": 1.08891487, "diversity_loss_mlp": 0.0, "epoch": 0.5113505194305502, "flos": 609024863232.0, "grad_norm": 0.08210850574888065, "language_loss": 0.83486134, "learning_rate": 0.0005059191656224487, "loss": 0.84586298, "num_input_tokens_seen": 221421056, "router_z_loss_mlp": 0.11254883, "routerloss_mlp": 0.0, "step": 2658, "time_per_iteration": 2.744889736175537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093286, "balance_loss_mlp": 1.08238411, "diversity_loss_mlp": 0.0, "epoch": 0.5115429011158138, "flos": 534477657600.0, "grad_norm": 0.07321009008554179, "language_loss": 0.88860798, "learning_rate": 0.0005056076440151212, "loss": 0.89954078, "num_input_tokens_seen": 221492064, "router_z_loss_mlp": 0.10906982, "routerloss_mlp": 0.0, "step": 2659, "time_per_iteration": 2.6951825618743896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0113007, "balance_loss_mlp": 1.12453902, "diversity_loss_mlp": 0.0, "epoch": 0.5117352828010774, "flos": 1362213780480.0, "grad_norm": 0.07076104465295206, "language_loss": 0.76288116, "learning_rate": 0.0005052961202307133, "loss": 0.77418184, "num_input_tokens_seen": 221724672, "router_z_loss_mlp": 0.05541992, "routerloss_mlp": 0.0, "step": 2660, "time_per_iteration": 4.850585460662842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081367, "balance_loss_mlp": 1.07051301, "diversity_loss_mlp": 0.0, "epoch": 0.5119276644863409, "flos": 633740433408.0, "grad_norm": 0.06225287802871053, "language_loss": 0.86966121, "learning_rate": 0.0005049845943901691, "loss": 0.88047487, "num_input_tokens_seen": 221800144, "router_z_loss_mlp": 0.10864258, "routerloss_mlp": 0.0, "step": 2661, "time_per_iteration": 2.8342370986938477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079888, "balance_loss_mlp": 1.0692786, "diversity_loss_mlp": 0.0, "epoch": 0.5121200461716044, "flos": 585598864896.0, "grad_norm": 0.058043198592839004, "language_loss": 0.86637139, "learning_rate": 0.0005046730666144338, "loss": 0.87717032, "num_input_tokens_seen": 221877168, "router_z_loss_mlp": 0.10620117, "routerloss_mlp": 0.0, "step": 2662, "time_per_iteration": 2.8066177368164062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078601, "balance_loss_mlp": 1.06801558, "diversity_loss_mlp": 0.0, "epoch": 0.512312427856868, "flos": 1032508767744.0, "grad_norm": 0.058701328600128284, "language_loss": 0.87834954, "learning_rate": 0.0005043615370244532, "loss": 0.88913548, "num_input_tokens_seen": 221964208, "router_z_loss_mlp": 0.10595703, "routerloss_mlp": 0.0, "step": 2663, "time_per_iteration": 3.3716113567352295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105143, "balance_loss_mlp": 1.04589903, "diversity_loss_mlp": 0.0, "epoch": 0.5125048095421316, "flos": 1537983645696.0, "grad_norm": 0.02890820887526385, "language_loss": 0.78244388, "learning_rate": 0.0005040500057411736, "loss": 0.79295814, "num_input_tokens_seen": 222179264, "router_z_loss_mlp": 0.05541992, "routerloss_mlp": 0.0, "step": 2664, "time_per_iteration": 4.632098913192749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074303, "balance_loss_mlp": 1.0636878, "diversity_loss_mlp": 0.0, "epoch": 0.5126971912273951, "flos": 591116175360.0, "grad_norm": 0.05776678043634197, "language_loss": 0.85301316, "learning_rate": 0.0005037384728855425, "loss": 0.86375624, "num_input_tokens_seen": 222259504, "router_z_loss_mlp": 0.10620117, "routerloss_mlp": 0.0, "step": 2665, "time_per_iteration": 2.8025074005126953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077204, "balance_loss_mlp": 1.06618285, "diversity_loss_mlp": 0.0, "epoch": 0.5128895729126587, "flos": 551657309184.0, "grad_norm": 0.08001364709617295, "language_loss": 0.84092522, "learning_rate": 0.0005034269385785075, "loss": 0.85169727, "num_input_tokens_seen": 222330512, "router_z_loss_mlp": 0.11022949, "routerloss_mlp": 0.0, "step": 2666, "time_per_iteration": 2.6508989334106445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070249, "balance_loss_mlp": 1.05929327, "diversity_loss_mlp": 0.0, "epoch": 0.5130819545979223, "flos": 481271030784.0, "grad_norm": 0.06550806602425656, "language_loss": 0.849998, "learning_rate": 0.0005031154029410168, "loss": 0.86070049, "num_input_tokens_seen": 222394000, "router_z_loss_mlp": 0.10955811, "routerloss_mlp": 0.0, "step": 2667, "time_per_iteration": 2.6072959899902344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062599, "balance_loss_mlp": 1.05130351, "diversity_loss_mlp": 0.0, "epoch": 0.5132743362831859, "flos": 475798136832.0, "grad_norm": 0.07261202613887993, "language_loss": 0.86903906, "learning_rate": 0.0005028038660940197, "loss": 0.87966514, "num_input_tokens_seen": 222459344, "router_z_loss_mlp": 0.11291504, "routerloss_mlp": 0.0, "step": 2668, "time_per_iteration": 2.5607664585113525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060608, "balance_loss_mlp": 1.04923522, "diversity_loss_mlp": 0.0, "epoch": 0.5134667179684494, "flos": 503827029504.0, "grad_norm": 0.06521290367629204, "language_loss": 0.84553415, "learning_rate": 0.0005024923281584648, "loss": 0.8561402, "num_input_tokens_seen": 222528912, "router_z_loss_mlp": 0.11376953, "routerloss_mlp": 0.0, "step": 2669, "time_per_iteration": 2.623643159866333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066692, "balance_loss_mlp": 1.05528402, "diversity_loss_mlp": 0.0, "epoch": 0.5136590996537129, "flos": 503918433792.0, "grad_norm": 0.06549707374857121, "language_loss": 0.82560658, "learning_rate": 0.0005021807892553026, "loss": 0.83627355, "num_input_tokens_seen": 222604704, "router_z_loss_mlp": 0.11413574, "routerloss_mlp": 0.0, "step": 2670, "time_per_iteration": 2.699392318725586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062757, "balance_loss_mlp": 1.05140269, "diversity_loss_mlp": 0.0, "epoch": 0.5138514813389765, "flos": 624623104512.0, "grad_norm": 0.07318428846825417, "language_loss": 0.84862608, "learning_rate": 0.0005018692495054828, "loss": 0.85925364, "num_input_tokens_seen": 222677888, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2671, "time_per_iteration": 2.7645046710968018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106912, "balance_loss_mlp": 1.05812323, "diversity_loss_mlp": 0.0, "epoch": 0.5140438630242401, "flos": 583545752064.0, "grad_norm": 0.06397327244364565, "language_loss": 0.80696338, "learning_rate": 0.0005015577090299561, "loss": 0.81765461, "num_input_tokens_seen": 222751936, "router_z_loss_mlp": 0.11004639, "routerloss_mlp": 0.0, "step": 2672, "time_per_iteration": 2.684048891067505 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068328, "balance_loss_mlp": 1.05731261, "diversity_loss_mlp": 0.0, "epoch": 0.5142362447095037, "flos": 487927411200.0, "grad_norm": 0.06574977800170037, "language_loss": 0.86744952, "learning_rate": 0.0005012461679496729, "loss": 0.87813282, "num_input_tokens_seen": 222819616, "router_z_loss_mlp": 0.11022949, "routerloss_mlp": 0.0, "step": 2673, "time_per_iteration": 2.5885825157165527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077367, "balance_loss_mlp": 1.06613708, "diversity_loss_mlp": 0.0, "epoch": 0.5144286263947672, "flos": 526857675264.0, "grad_norm": 0.09032594792095527, "language_loss": 0.87748468, "learning_rate": 0.0005009346263855848, "loss": 0.88825834, "num_input_tokens_seen": 222888448, "router_z_loss_mlp": 0.11236572, "routerloss_mlp": 0.0, "step": 2674, "time_per_iteration": 2.5970752239227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092275, "balance_loss_mlp": 1.08141518, "diversity_loss_mlp": 0.0, "epoch": 0.5146210080800308, "flos": 486518897664.0, "grad_norm": 0.06465969942237398, "language_loss": 0.83699256, "learning_rate": 0.0005006230844586422, "loss": 0.84791529, "num_input_tokens_seen": 222964736, "router_z_loss_mlp": 0.10858154, "routerloss_mlp": 0.0, "step": 2675, "time_per_iteration": 2.7912445068359375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00882234, "balance_loss_mlp": 1.52600026, "diversity_loss_mlp": 0.21199086, "epoch": 0.5148133897652943, "flos": 515892063744.0, "grad_norm": 0.0263651655655577, "language_loss": 0.78895926, "learning_rate": 0.0005003115422897968, "loss": 0.79778159, "num_input_tokens_seen": 223040944, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01323896, "step": 2676, "time_per_iteration": 2.8051552772521973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111408, "balance_loss_mlp": 1.10282683, "diversity_loss_mlp": 0.0, "epoch": 0.5150057714505579, "flos": 511212446208.0, "grad_norm": 0.0741463219638638, "language_loss": 0.87253916, "learning_rate": 0.0005, "loss": 0.88367999, "num_input_tokens_seen": 223109632, "router_z_loss_mlp": 0.11254883, "routerloss_mlp": 0.0, "step": 2677, "time_per_iteration": 2.6435391902923584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119404, "balance_loss_mlp": 1.10841274, "diversity_loss_mlp": 0.0, "epoch": 0.5151981531358215, "flos": 910909877760.0, "grad_norm": 0.08792863943872284, "language_loss": 0.79283178, "learning_rate": 0.0004996884577102033, "loss": 0.80402583, "num_input_tokens_seen": 223191648, "router_z_loss_mlp": 0.10992432, "routerloss_mlp": 0.0, "step": 2678, "time_per_iteration": 3.089707374572754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111646, "balance_loss_mlp": 1.10545659, "diversity_loss_mlp": 0.0, "epoch": 0.515390534821085, "flos": 471864434688.0, "grad_norm": 0.08112886088857633, "language_loss": 0.84611261, "learning_rate": 0.000499376915541358, "loss": 0.85727721, "num_input_tokens_seen": 223265920, "router_z_loss_mlp": 0.10998535, "routerloss_mlp": 0.0, "step": 2679, "time_per_iteration": 2.7143540382385254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109969, "balance_loss_mlp": 1.08910465, "diversity_loss_mlp": 0.0, "epoch": 0.5155829165063486, "flos": 650119468032.0, "grad_norm": 0.16255458440641746, "language_loss": 0.81113428, "learning_rate": 0.0004990653736144155, "loss": 0.82213122, "num_input_tokens_seen": 223340688, "router_z_loss_mlp": 0.10595703, "routerloss_mlp": 0.0, "step": 2680, "time_per_iteration": 2.857952356338501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084916, "balance_loss_mlp": 1.07416916, "diversity_loss_mlp": 0.0, "epoch": 0.5157752981916122, "flos": 414262315008.0, "grad_norm": 0.06912387000686389, "language_loss": 0.85820174, "learning_rate": 0.0004987538320503271, "loss": 0.86905092, "num_input_tokens_seen": 223404064, "router_z_loss_mlp": 0.10748291, "routerloss_mlp": 0.0, "step": 2681, "time_per_iteration": 2.485462188720703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077984, "balance_loss_mlp": 1.06715369, "diversity_loss_mlp": 0.0, "epoch": 0.5159676798768758, "flos": 553841473536.0, "grad_norm": 0.08121908376237164, "language_loss": 0.83137929, "learning_rate": 0.0004984422909700442, "loss": 0.84215909, "num_input_tokens_seen": 223476784, "router_z_loss_mlp": 0.10845947, "routerloss_mlp": 0.0, "step": 2682, "time_per_iteration": 2.7179505825042725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068711, "balance_loss_mlp": 1.05784559, "diversity_loss_mlp": 0.0, "epoch": 0.5161600615621393, "flos": 586510709760.0, "grad_norm": 0.07829442771548371, "language_loss": 0.83800036, "learning_rate": 0.0004981307504945173, "loss": 0.84868753, "num_input_tokens_seen": 223542832, "router_z_loss_mlp": 0.10876465, "routerloss_mlp": 0.0, "step": 2683, "time_per_iteration": 2.71893048286438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061815, "balance_loss_mlp": 1.05075228, "diversity_loss_mlp": 0.0, "epoch": 0.5163524432474028, "flos": 588843177984.0, "grad_norm": 0.08619577510477876, "language_loss": 0.89448887, "learning_rate": 0.0004978192107446976, "loss": 0.90510702, "num_input_tokens_seen": 223617968, "router_z_loss_mlp": 0.11071777, "routerloss_mlp": 0.0, "step": 2684, "time_per_iteration": 2.7385506629943848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062179, "balance_loss_mlp": 1.05111599, "diversity_loss_mlp": 0.0, "epoch": 0.5165448249326664, "flos": 503893840896.0, "grad_norm": 0.08129158019501125, "language_loss": 0.8740204, "learning_rate": 0.0004975076718415353, "loss": 0.88464212, "num_input_tokens_seen": 223689504, "router_z_loss_mlp": 0.11077881, "routerloss_mlp": 0.0, "step": 2685, "time_per_iteration": 2.599379777908325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055804, "balance_loss_mlp": 1.04478931, "diversity_loss_mlp": 0.0, "epoch": 0.51673720661793, "flos": 416760339456.0, "grad_norm": 0.06772474949474022, "language_loss": 0.90610582, "learning_rate": 0.0004971961339059806, "loss": 0.91666389, "num_input_tokens_seen": 223752288, "router_z_loss_mlp": 0.11016846, "routerloss_mlp": 0.0, "step": 2686, "time_per_iteration": 2.498819589614868 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057473, "balance_loss_mlp": 1.04611838, "diversity_loss_mlp": 0.0, "epoch": 0.5169295883031936, "flos": 598971096576.0, "grad_norm": 0.06487308694775892, "language_loss": 0.84021914, "learning_rate": 0.0004968845970589832, "loss": 0.85079384, "num_input_tokens_seen": 223822304, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2687, "time_per_iteration": 2.6814825534820557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061325, "balance_loss_mlp": 1.04982185, "diversity_loss_mlp": 0.0, "epoch": 0.517121969988457, "flos": 556816343040.0, "grad_norm": 0.06911328459433905, "language_loss": 0.8435297, "learning_rate": 0.0004965730614214926, "loss": 0.8541429, "num_input_tokens_seen": 223888592, "router_z_loss_mlp": 0.11499023, "routerloss_mlp": 0.0, "step": 2688, "time_per_iteration": 2.6537294387817383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106762, "balance_loss_mlp": 1.05618167, "diversity_loss_mlp": 0.0, "epoch": 0.5173143516737206, "flos": 469445704704.0, "grad_norm": 0.07039148040030412, "language_loss": 0.85285878, "learning_rate": 0.0004962615271144576, "loss": 0.86353499, "num_input_tokens_seen": 223952880, "router_z_loss_mlp": 0.11431885, "routerloss_mlp": 0.0, "step": 2689, "time_per_iteration": 2.50710129737854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064714, "balance_loss_mlp": 1.05325246, "diversity_loss_mlp": 0.0, "epoch": 0.5175067333589842, "flos": 720065977344.0, "grad_norm": 0.0770213433091723, "language_loss": 0.82680881, "learning_rate": 0.0004959499942588264, "loss": 0.83745599, "num_input_tokens_seen": 224030000, "router_z_loss_mlp": 0.11456299, "routerloss_mlp": 0.0, "step": 2690, "time_per_iteration": 2.892293930053711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049886, "balance_loss_mlp": 1.04297149, "diversity_loss_mlp": 0.0, "epoch": 0.5176991150442478, "flos": 1466188480512.0, "grad_norm": 0.03551055813206397, "language_loss": 0.78200024, "learning_rate": 0.0004956384629755469, "loss": 0.79249913, "num_input_tokens_seen": 224252384, "router_z_loss_mlp": 0.06933594, "routerloss_mlp": 0.0, "step": 2691, "time_per_iteration": 4.764665842056274 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070219, "balance_loss_mlp": 1.05894208, "diversity_loss_mlp": 0.0, "epoch": 0.5178914967295114, "flos": 612632222208.0, "grad_norm": 0.08037192658361764, "language_loss": 0.85416174, "learning_rate": 0.0004953269333855661, "loss": 0.86486399, "num_input_tokens_seen": 224324640, "router_z_loss_mlp": 0.11273193, "routerloss_mlp": 0.0, "step": 2692, "time_per_iteration": 2.785511016845703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075514, "balance_loss_mlp": 1.06407034, "diversity_loss_mlp": 0.0, "epoch": 0.5180838784147749, "flos": 500926311936.0, "grad_norm": 0.06114385406953633, "language_loss": 0.84516799, "learning_rate": 0.0004950154056098309, "loss": 0.85592318, "num_input_tokens_seen": 224398368, "router_z_loss_mlp": 0.11437988, "routerloss_mlp": 0.0, "step": 2693, "time_per_iteration": 2.683246374130249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083334, "balance_loss_mlp": 1.07183599, "diversity_loss_mlp": 0.0, "epoch": 0.5182762601000385, "flos": 688832418816.0, "grad_norm": 0.08066804074186672, "language_loss": 0.84078431, "learning_rate": 0.0004947038797692867, "loss": 0.85161769, "num_input_tokens_seen": 224465456, "router_z_loss_mlp": 0.1149292, "routerloss_mlp": 0.0, "step": 2694, "time_per_iteration": 2.8312196731567383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00872465, "balance_loss_mlp": 1.50766385, "diversity_loss_mlp": 0.2097543, "epoch": 0.518468641785302, "flos": 665611623936.0, "grad_norm": 0.031552182630998016, "language_loss": 0.77636528, "learning_rate": 0.0004943923559848789, "loss": 0.78508997, "num_input_tokens_seen": 224540960, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01375636, "step": 2695, "time_per_iteration": 2.8084189891815186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010865, "balance_loss_mlp": 1.07534158, "diversity_loss_mlp": 0.0, "epoch": 0.5186610234705656, "flos": 566714465280.0, "grad_norm": 0.055486891719670514, "language_loss": 0.90695632, "learning_rate": 0.0004940808343775515, "loss": 0.91782129, "num_input_tokens_seen": 224613200, "router_z_loss_mlp": 0.1116333, "routerloss_mlp": 0.0, "step": 2696, "time_per_iteration": 2.6868011951446533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00874209, "balance_loss_mlp": 1.50797677, "diversity_loss_mlp": 0.21290711, "epoch": 0.5188534051558291, "flos": 428879702016.0, "grad_norm": 0.034010170020107075, "language_loss": 0.82213199, "learning_rate": 0.0004937693150682479, "loss": 0.83087409, "num_input_tokens_seen": 224677456, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01376703, "step": 2697, "time_per_iteration": 2.5905513763427734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090314, "balance_loss_mlp": 1.07915568, "diversity_loss_mlp": 0.0, "epoch": 0.5190457868410927, "flos": 546349971456.0, "grad_norm": 0.06705206433038317, "language_loss": 0.7658723, "learning_rate": 0.0004934577981779107, "loss": 0.77677542, "num_input_tokens_seen": 224745600, "router_z_loss_mlp": 0.1116333, "routerloss_mlp": 0.0, "step": 2698, "time_per_iteration": 2.7049057483673096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087095, "balance_loss_mlp": 1.07585335, "diversity_loss_mlp": 0.0, "epoch": 0.5192381685263563, "flos": 548605716480.0, "grad_norm": 0.061529133753451364, "language_loss": 0.812904, "learning_rate": 0.0004931462838274817, "loss": 0.82377493, "num_input_tokens_seen": 224826944, "router_z_loss_mlp": 0.11242676, "routerloss_mlp": 0.0, "step": 2699, "time_per_iteration": 2.8723175525665283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089813, "balance_loss_mlp": 1.07877994, "diversity_loss_mlp": 0.0, "epoch": 0.5194305502116199, "flos": 575263544832.0, "grad_norm": 0.08487292742433496, "language_loss": 0.84222901, "learning_rate": 0.0004928347721379011, "loss": 0.85312712, "num_input_tokens_seen": 224895280, "router_z_loss_mlp": 0.11035156, "routerloss_mlp": 0.0, "step": 2700, "time_per_iteration": 2.639867067337036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080545, "balance_loss_mlp": 1.06974459, "diversity_loss_mlp": 0.0, "epoch": 0.5196229318968835, "flos": 434258620416.0, "grad_norm": 0.06134037245316137, "language_loss": 0.82221866, "learning_rate": 0.0004925232632301089, "loss": 0.83302414, "num_input_tokens_seen": 224961632, "router_z_loss_mlp": 0.10797119, "routerloss_mlp": 0.0, "step": 2701, "time_per_iteration": 2.622311592102051 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077123, "balance_loss_mlp": 1.0660243, "diversity_loss_mlp": 0.0, "epoch": 0.5198153135821469, "flos": 558881938944.0, "grad_norm": 0.06337758152829237, "language_loss": 0.79842103, "learning_rate": 0.0004922117572250431, "loss": 0.80919224, "num_input_tokens_seen": 225032816, "router_z_loss_mlp": 0.11096191, "routerloss_mlp": 0.0, "step": 2702, "time_per_iteration": 2.6980605125427246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070723, "balance_loss_mlp": 1.05936241, "diversity_loss_mlp": 0.0, "epoch": 0.5200076952674105, "flos": 565684051968.0, "grad_norm": 0.07398400160993446, "language_loss": 0.80852163, "learning_rate": 0.0004919002542436414, "loss": 0.81922889, "num_input_tokens_seen": 225112736, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2703, "time_per_iteration": 2.8354647159576416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072853, "balance_loss_mlp": 1.0619514, "diversity_loss_mlp": 0.0, "epoch": 0.5202000769526741, "flos": 571186681344.0, "grad_norm": 0.064542502306726, "language_loss": 0.8126899, "learning_rate": 0.0004915887544068399, "loss": 0.8234185, "num_input_tokens_seen": 225182672, "router_z_loss_mlp": 0.10906982, "routerloss_mlp": 0.0, "step": 2704, "time_per_iteration": 2.6693973541259766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068776, "balance_loss_mlp": 1.05770195, "diversity_loss_mlp": 0.0, "epoch": 0.5203924586379377, "flos": 694211337216.0, "grad_norm": 0.06578360362401801, "language_loss": 0.7856639, "learning_rate": 0.0004912772578355736, "loss": 0.79635167, "num_input_tokens_seen": 225260272, "router_z_loss_mlp": 0.11071777, "routerloss_mlp": 0.0, "step": 2705, "time_per_iteration": 2.892735481262207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107232, "balance_loss_mlp": 1.0611918, "diversity_loss_mlp": 0.0, "epoch": 0.5205848403232012, "flos": 566509261824.0, "grad_norm": 0.07750798967783011, "language_loss": 0.82549465, "learning_rate": 0.000490965764650776, "loss": 0.83621788, "num_input_tokens_seen": 225337120, "router_z_loss_mlp": 0.11126709, "routerloss_mlp": 0.0, "step": 2706, "time_per_iteration": 2.8544106483459473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070979, "balance_loss_mlp": 1.05984521, "diversity_loss_mlp": 0.0, "epoch": 0.5207772220084648, "flos": 1214259932160.0, "grad_norm": 0.06572065456776559, "language_loss": 0.82828736, "learning_rate": 0.0004906542749733798, "loss": 0.83899713, "num_input_tokens_seen": 225433984, "router_z_loss_mlp": 0.11132812, "routerloss_mlp": 0.0, "step": 2707, "time_per_iteration": 3.6044294834136963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107442, "balance_loss_mlp": 1.06353068, "diversity_loss_mlp": 0.0, "epoch": 0.5209696036937284, "flos": 592843318272.0, "grad_norm": 0.055629683487612144, "language_loss": 0.85401118, "learning_rate": 0.0004903427889243156, "loss": 0.86475539, "num_input_tokens_seen": 225512112, "router_z_loss_mlp": 0.10894775, "routerloss_mlp": 0.0, "step": 2708, "time_per_iteration": 2.830115795135498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075334, "balance_loss_mlp": 1.06425905, "diversity_loss_mlp": 0.0, "epoch": 0.5211619853789919, "flos": 522889468416.0, "grad_norm": 0.06692681375903406, "language_loss": 0.85444081, "learning_rate": 0.0004900313066245134, "loss": 0.86519414, "num_input_tokens_seen": 225586944, "router_z_loss_mlp": 0.11077881, "routerloss_mlp": 0.0, "step": 2709, "time_per_iteration": 2.6552441120147705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106839, "balance_loss_mlp": 1.05745232, "diversity_loss_mlp": 0.0, "epoch": 0.5213543670642555, "flos": 502799187456.0, "grad_norm": 0.06855502771674758, "language_loss": 0.81061214, "learning_rate": 0.0004897198281949012, "loss": 0.82129598, "num_input_tokens_seen": 225657184, "router_z_loss_mlp": 0.10949707, "routerloss_mlp": 0.0, "step": 2710, "time_per_iteration": 2.645981550216675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00874972, "balance_loss_mlp": 1.51124442, "diversity_loss_mlp": 0.21021394, "epoch": 0.521546748749519, "flos": 585959712768.0, "grad_norm": 0.03577466895356274, "language_loss": 0.78009295, "learning_rate": 0.0004894083537564057, "loss": 0.78884268, "num_input_tokens_seen": 225729968, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01424256, "step": 2711, "time_per_iteration": 2.746945858001709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0086804, "balance_loss_mlp": 1.49602354, "diversity_loss_mlp": 0.21089339, "epoch": 0.5217391304347826, "flos": 570119192064.0, "grad_norm": 0.02967241377466632, "language_loss": 0.80981171, "learning_rate": 0.0004890968834299519, "loss": 0.81849211, "num_input_tokens_seen": 225801808, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01458106, "step": 2712, "time_per_iteration": 2.749049663543701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072348, "balance_loss_mlp": 1.06096959, "diversity_loss_mlp": 0.0, "epoch": 0.5219315121200462, "flos": 542784457728.0, "grad_norm": 0.06422523073894505, "language_loss": 0.78739542, "learning_rate": 0.0004887854173364633, "loss": 0.79811883, "num_input_tokens_seen": 225878576, "router_z_loss_mlp": 0.11364746, "routerloss_mlp": 0.0, "step": 2713, "time_per_iteration": 2.760077953338623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00862336, "balance_loss_mlp": 1.48416615, "diversity_loss_mlp": 0.2112534, "epoch": 0.5221238938053098, "flos": 550310464512.0, "grad_norm": 0.02839704110509781, "language_loss": 0.81564224, "learning_rate": 0.0004884739555968617, "loss": 0.8242656, "num_input_tokens_seen": 225960096, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01462588, "step": 2714, "time_per_iteration": 2.902200698852539 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043267, "balance_loss_mlp": 1.03711605, "diversity_loss_mlp": 0.0, "epoch": 0.5223162754905732, "flos": 1355174157312.0, "grad_norm": 0.025188943281148922, "language_loss": 0.78977054, "learning_rate": 0.0004881624983320676, "loss": 0.8002032, "num_input_tokens_seen": 226184960, "router_z_loss_mlp": 0.06152344, "routerloss_mlp": 0.0, "step": 2715, "time_per_iteration": 4.977273464202881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00847492, "balance_loss_mlp": 1.45660305, "diversity_loss_mlp": 0.21012819, "epoch": 0.5225086571758368, "flos": 567747076608.0, "grad_norm": 0.03573397478438407, "language_loss": 0.86888605, "learning_rate": 0.0004878510456629992, "loss": 0.87736094, "num_input_tokens_seen": 226271328, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01412619, "step": 2716, "time_per_iteration": 2.998455286026001 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068588, "balance_loss_mlp": 1.05767989, "diversity_loss_mlp": 0.0, "epoch": 0.5227010388611004, "flos": 500158001664.0, "grad_norm": 0.06765059094142209, "language_loss": 0.85142076, "learning_rate": 0.00048753959771057314, "loss": 0.86210662, "num_input_tokens_seen": 226340080, "router_z_loss_mlp": 0.10925293, "routerloss_mlp": 0.0, "step": 2717, "time_per_iteration": 2.6113662719726562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065726, "balance_loss_mlp": 1.05442464, "diversity_loss_mlp": 0.0, "epoch": 0.522893420546364, "flos": 597656558592.0, "grad_norm": 0.08600503840688169, "language_loss": 0.82445514, "learning_rate": 0.0004872281545957044, "loss": 0.83511233, "num_input_tokens_seen": 226415120, "router_z_loss_mlp": 0.11297607, "routerloss_mlp": 0.0, "step": 2718, "time_per_iteration": 2.7617604732513428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070418, "balance_loss_mlp": 1.05911732, "diversity_loss_mlp": 0.0, "epoch": 0.5230858022316276, "flos": 664605803520.0, "grad_norm": 0.061040572409093316, "language_loss": 0.86051857, "learning_rate": 0.0004869167164393055, "loss": 0.87122279, "num_input_tokens_seen": 226501200, "router_z_loss_mlp": 0.11303711, "routerloss_mlp": 0.0, "step": 2719, "time_per_iteration": 2.932154417037964 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069726, "balance_loss_mlp": 1.05857992, "diversity_loss_mlp": 0.0, "epoch": 0.5232781839168911, "flos": 603843434496.0, "grad_norm": 0.11614833297327579, "language_loss": 0.89542395, "learning_rate": 0.00048660528336228793, "loss": 0.90612125, "num_input_tokens_seen": 226582064, "router_z_loss_mlp": 0.11151123, "routerloss_mlp": 0.0, "step": 2720, "time_per_iteration": 2.7917380332946777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071611, "balance_loss_mlp": 1.06013143, "diversity_loss_mlp": 0.0, "epoch": 0.5234705656021547, "flos": 550718300160.0, "grad_norm": 0.05730438157509479, "language_loss": 0.90177751, "learning_rate": 0.0004862938554855606, "loss": 0.91249359, "num_input_tokens_seen": 226656448, "router_z_loss_mlp": 0.11474609, "routerloss_mlp": 0.0, "step": 2721, "time_per_iteration": 2.809875965118408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074661, "balance_loss_mlp": 1.06371188, "diversity_loss_mlp": 0.0, "epoch": 0.5236629472874182, "flos": 504279281664.0, "grad_norm": 0.06740042101514945, "language_loss": 0.86071771, "learning_rate": 0.0004859824329300304, "loss": 0.87146431, "num_input_tokens_seen": 226725568, "router_z_loss_mlp": 0.10949707, "routerloss_mlp": 0.0, "step": 2722, "time_per_iteration": 2.5660176277160645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070887, "balance_loss_mlp": 1.05932951, "diversity_loss_mlp": 0.0, "epoch": 0.5238553289726818, "flos": 547654597632.0, "grad_norm": 0.06312939516717878, "language_loss": 0.83826602, "learning_rate": 0.00048567101581660244, "loss": 0.84897488, "num_input_tokens_seen": 226795728, "router_z_loss_mlp": 0.11560059, "routerloss_mlp": 0.0, "step": 2723, "time_per_iteration": 2.593005895614624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107168, "balance_loss_mlp": 1.0603317, "diversity_loss_mlp": 0.0, "epoch": 0.5240477106579453, "flos": 531962380800.0, "grad_norm": 0.07171512526566694, "language_loss": 0.86622667, "learning_rate": 0.00048535960426617956, "loss": 0.87694347, "num_input_tokens_seen": 226865344, "router_z_loss_mlp": 0.11346436, "routerloss_mlp": 0.0, "step": 2724, "time_per_iteration": 2.611551523208618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070313, "balance_loss_mlp": 1.05852962, "diversity_loss_mlp": 0.0, "epoch": 0.5242400923432089, "flos": 617939559936.0, "grad_norm": 0.07077799246948024, "language_loss": 0.81735158, "learning_rate": 0.0004850481983996621, "loss": 0.82805473, "num_input_tokens_seen": 226936800, "router_z_loss_mlp": 0.11767578, "routerloss_mlp": 0.0, "step": 2725, "time_per_iteration": 2.7656939029693604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058325, "balance_loss_mlp": 1.04673731, "diversity_loss_mlp": 0.0, "epoch": 0.5244324740284725, "flos": 416686187520.0, "grad_norm": 0.07497614956550303, "language_loss": 0.87961793, "learning_rate": 0.0004847367983379492, "loss": 0.89020109, "num_input_tokens_seen": 226998448, "router_z_loss_mlp": 0.11578369, "routerloss_mlp": 0.0, "step": 2726, "time_per_iteration": 2.523099899291992 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066059, "balance_loss_mlp": 1.05477571, "diversity_loss_mlp": 0.0, "epoch": 0.5246248557137361, "flos": 626436509184.0, "grad_norm": 0.06275633211650163, "language_loss": 0.78715622, "learning_rate": 0.00048442540420193643, "loss": 0.79781681, "num_input_tokens_seen": 227081872, "router_z_loss_mlp": 0.11291504, "routerloss_mlp": 0.0, "step": 2727, "time_per_iteration": 2.9433038234710693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056804, "balance_loss_mlp": 1.04506755, "diversity_loss_mlp": 0.0, "epoch": 0.5248172373989997, "flos": 1248463590912.0, "grad_norm": 0.07393634521455344, "language_loss": 0.79367208, "learning_rate": 0.0004841140161125182, "loss": 0.80424011, "num_input_tokens_seen": 227167744, "router_z_loss_mlp": 0.11730957, "routerloss_mlp": 0.0, "step": 2728, "time_per_iteration": 3.619252920150757 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063037, "balance_loss_mlp": 1.05171847, "diversity_loss_mlp": 0.0, "epoch": 0.5250096190842631, "flos": 506868710400.0, "grad_norm": 0.07165329358033216, "language_loss": 0.84827459, "learning_rate": 0.0004838026341905857, "loss": 0.85890496, "num_input_tokens_seen": 227239136, "router_z_loss_mlp": 0.11322021, "routerloss_mlp": 0.0, "step": 2729, "time_per_iteration": 2.716114044189453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057536, "balance_loss_mlp": 1.04594862, "diversity_loss_mlp": 0.0, "epoch": 0.5252020007695267, "flos": 611317684224.0, "grad_norm": 0.13042739485624238, "language_loss": 0.85312545, "learning_rate": 0.00048349125855702844, "loss": 0.86370087, "num_input_tokens_seen": 227311968, "router_z_loss_mlp": 0.11572266, "routerloss_mlp": 0.0, "step": 2730, "time_per_iteration": 2.787280559539795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00837258, "balance_loss_mlp": 1.43598437, "diversity_loss_mlp": 0.21135046, "epoch": 0.5253943824547903, "flos": 539233998336.0, "grad_norm": 0.027658523195400363, "language_loss": 0.81318069, "learning_rate": 0.00048317988933273287, "loss": 0.82155323, "num_input_tokens_seen": 227385248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01359018, "step": 2731, "time_per_iteration": 2.763814687728882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057346, "balance_loss_mlp": 1.04585993, "diversity_loss_mlp": 0.0, "epoch": 0.5255867641400539, "flos": 698038580736.0, "grad_norm": 0.07420390441928848, "language_loss": 0.82373381, "learning_rate": 0.00048286852663858367, "loss": 0.83430725, "num_input_tokens_seen": 227464640, "router_z_loss_mlp": 0.11480713, "routerloss_mlp": 0.0, "step": 2732, "time_per_iteration": 2.9533157348632812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063655, "balance_loss_mlp": 1.05203819, "diversity_loss_mlp": 0.0, "epoch": 0.5257791458253175, "flos": 667289207808.0, "grad_norm": 0.07616653501098058, "language_loss": 0.8428973, "learning_rate": 0.000482557170595462, "loss": 0.8535338, "num_input_tokens_seen": 227542192, "router_z_loss_mlp": 0.11608887, "routerloss_mlp": 0.0, "step": 2733, "time_per_iteration": 2.865147829055786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065104, "balance_loss_mlp": 1.0532366, "diversity_loss_mlp": 0.0, "epoch": 0.525971527510581, "flos": 483620751360.0, "grad_norm": 0.060395165010054055, "language_loss": 0.87880594, "learning_rate": 0.0004822458213242475, "loss": 0.88945693, "num_input_tokens_seen": 227606096, "router_z_loss_mlp": 0.11859131, "routerloss_mlp": 0.0, "step": 2734, "time_per_iteration": 2.557253360748291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070633, "balance_loss_mlp": 1.05886698, "diversity_loss_mlp": 0.0, "epoch": 0.5261639091958445, "flos": 829916264448.0, "grad_norm": 0.1031910380133139, "language_loss": 0.86086309, "learning_rate": 0.00048193447894581627, "loss": 0.8715694, "num_input_tokens_seen": 227689552, "router_z_loss_mlp": 0.11761475, "routerloss_mlp": 0.0, "step": 2735, "time_per_iteration": 3.122976541519165 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076001, "balance_loss_mlp": 1.06436014, "diversity_loss_mlp": 0.0, "epoch": 0.5263562908811081, "flos": 520715215872.0, "grad_norm": 0.06843040001694842, "language_loss": 0.8809998, "learning_rate": 0.00048162314358104243, "loss": 0.89175981, "num_input_tokens_seen": 227760784, "router_z_loss_mlp": 0.11639404, "routerloss_mlp": 0.0, "step": 2736, "time_per_iteration": 2.6340246200561523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00824973, "balance_loss_mlp": 1.41347969, "diversity_loss_mlp": 0.20989257, "epoch": 0.5265486725663717, "flos": 574996672512.0, "grad_norm": 0.031515925317837694, "language_loss": 0.83306372, "learning_rate": 0.0004813118153507969, "loss": 0.84131336, "num_input_tokens_seen": 227834304, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01328672, "step": 2737, "time_per_iteration": 2.7356157302856445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041199, "balance_loss_mlp": 1.03480983, "diversity_loss_mlp": 0.0, "epoch": 0.5267410542516352, "flos": 1547261015040.0, "grad_norm": 0.03217065957479051, "language_loss": 0.82447124, "learning_rate": 0.0004810004943759482, "loss": 0.83488321, "num_input_tokens_seen": 228057232, "router_z_loss_mlp": 0.06396484, "routerloss_mlp": 0.0, "step": 2738, "time_per_iteration": 4.772867202758789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107199, "balance_loss_mlp": 1.06062317, "diversity_loss_mlp": 0.0, "epoch": 0.5269334359368988, "flos": 929952493056.0, "grad_norm": 0.0555866415390632, "language_loss": 0.83715498, "learning_rate": 0.00048068918077736163, "loss": 0.84787494, "num_input_tokens_seen": 228140816, "router_z_loss_mlp": 0.11358643, "routerloss_mlp": 0.0, "step": 2739, "time_per_iteration": 3.2028074264526367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076914, "balance_loss_mlp": 1.06573176, "diversity_loss_mlp": 0.0, "epoch": 0.5271258176221624, "flos": 655389729792.0, "grad_norm": 0.06998122113459494, "language_loss": 0.81445146, "learning_rate": 0.0004803778746759001, "loss": 0.82522058, "num_input_tokens_seen": 228216208, "router_z_loss_mlp": 0.11181641, "routerloss_mlp": 0.0, "step": 2740, "time_per_iteration": 2.87070369720459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082959, "balance_loss_mlp": 1.07215285, "diversity_loss_mlp": 0.0, "epoch": 0.527318199307426, "flos": 543036648960.0, "grad_norm": 0.07737040857299185, "language_loss": 0.82122779, "learning_rate": 0.00048006657619242317, "loss": 0.83205736, "num_input_tokens_seen": 228283184, "router_z_loss_mlp": 0.10809326, "routerloss_mlp": 0.0, "step": 2741, "time_per_iteration": 2.6385269165039062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107519, "balance_loss_mlp": 1.06447887, "diversity_loss_mlp": 0.0, "epoch": 0.5275105809926895, "flos": 447882670080.0, "grad_norm": 0.07879516603511716, "language_loss": 0.78380877, "learning_rate": 0.00047975528544778775, "loss": 0.79456067, "num_input_tokens_seen": 228351328, "router_z_loss_mlp": 0.10717773, "routerloss_mlp": 0.0, "step": 2742, "time_per_iteration": 2.6197235584259033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079206, "balance_loss_mlp": 1.06839335, "diversity_loss_mlp": 0.0, "epoch": 0.527702962677953, "flos": 578935143936.0, "grad_norm": 0.07439948679259917, "language_loss": 0.88816094, "learning_rate": 0.00047944400256284754, "loss": 0.89895302, "num_input_tokens_seen": 228423632, "router_z_loss_mlp": 0.10827637, "routerloss_mlp": 0.0, "step": 2743, "time_per_iteration": 2.6887855529785156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00830459, "balance_loss_mlp": 1.42072511, "diversity_loss_mlp": 0.21262056, "epoch": 0.5278953443632166, "flos": 652773136896.0, "grad_norm": 0.03227823662204125, "language_loss": 0.799101, "learning_rate": 0.0004791327276584532, "loss": 0.80740565, "num_input_tokens_seen": 228498736, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01378582, "step": 2744, "time_per_iteration": 2.8497848510742188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087072, "balance_loss_mlp": 1.07629538, "diversity_loss_mlp": 0.0, "epoch": 0.5280877260484802, "flos": 514001935872.0, "grad_norm": 0.0718535906247093, "language_loss": 0.80497956, "learning_rate": 0.00047882146085545264, "loss": 0.81585032, "num_input_tokens_seen": 228569056, "router_z_loss_mlp": 0.10784912, "routerloss_mlp": 0.0, "step": 2745, "time_per_iteration": 2.6078941822052 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017458, "balance_loss_mlp": 1.01199865, "diversity_loss_mlp": 0.0, "epoch": 0.5282801077337438, "flos": 1445460567552.0, "grad_norm": 0.013176381696238814, "language_loss": 0.75402379, "learning_rate": 0.00047851020227469, "loss": 0.76419842, "num_input_tokens_seen": 228800560, "router_z_loss_mlp": 0.0546875, "routerloss_mlp": 0.0, "step": 2746, "time_per_iteration": 4.974900007247925 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078307, "balance_loss_mlp": 1.06777453, "diversity_loss_mlp": 0.0, "epoch": 0.5284724894190073, "flos": 604856595456.0, "grad_norm": 0.0894490118638191, "language_loss": 0.79344547, "learning_rate": 0.00047819895203700684, "loss": 0.80422854, "num_input_tokens_seen": 228869216, "router_z_loss_mlp": 0.10534668, "routerloss_mlp": 0.0, "step": 2747, "time_per_iteration": 2.717135190963745 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01015273, "balance_loss_mlp": 1.00983751, "diversity_loss_mlp": 0.0, "epoch": 0.5286648711042709, "flos": 1494956321280.0, "grad_norm": 0.009473538771460566, "language_loss": 0.75512433, "learning_rate": 0.0004778877102632412, "loss": 0.76527709, "num_input_tokens_seen": 229085520, "router_z_loss_mlp": 0.05444336, "routerloss_mlp": 0.0, "step": 2748, "time_per_iteration": 4.642770290374756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085947, "balance_loss_mlp": 1.07577801, "diversity_loss_mlp": 0.0, "epoch": 0.5288572527895344, "flos": 597616911360.0, "grad_norm": 0.07060951554594143, "language_loss": 0.88469762, "learning_rate": 0.0004775764770742277, "loss": 0.89555711, "num_input_tokens_seen": 229160912, "router_z_loss_mlp": 0.10168457, "routerloss_mlp": 0.0, "step": 2749, "time_per_iteration": 2.8018476963043213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087955, "balance_loss_mlp": 1.07761312, "diversity_loss_mlp": 0.0, "epoch": 0.529049634474798, "flos": 557320352256.0, "grad_norm": 0.08234082280170717, "language_loss": 0.86406553, "learning_rate": 0.00047726525259079777, "loss": 0.8749451, "num_input_tokens_seen": 229235792, "router_z_loss_mlp": 0.10345459, "routerloss_mlp": 0.0, "step": 2750, "time_per_iteration": 2.8415229320526123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00831428, "balance_loss_mlp": 1.42309499, "diversity_loss_mlp": 0.21321589, "epoch": 0.5292420161600616, "flos": 581274952704.0, "grad_norm": 0.03400797212131273, "language_loss": 0.88723552, "learning_rate": 0.0004769540369337798, "loss": 0.89554983, "num_input_tokens_seen": 229309984, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01327293, "step": 2751, "time_per_iteration": 2.752032518386841 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100532, "balance_loss_mlp": 1.09000587, "diversity_loss_mlp": 0.0, "epoch": 0.5294343978453251, "flos": 608303167488.0, "grad_norm": 0.06288245154731438, "language_loss": 0.85769415, "learning_rate": 0.00047664283022399794, "loss": 0.86869949, "num_input_tokens_seen": 229394000, "router_z_loss_mlp": 0.10534668, "routerloss_mlp": 0.0, "step": 2752, "time_per_iteration": 2.8568003177642822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107464, "balance_loss_mlp": 1.09725976, "diversity_loss_mlp": 0.0, "epoch": 0.5296267795305887, "flos": 646522020864.0, "grad_norm": 0.0883883166781065, "language_loss": 0.80924225, "learning_rate": 0.00047633163258227376, "loss": 0.82031691, "num_input_tokens_seen": 229474320, "router_z_loss_mlp": 0.10205078, "routerloss_mlp": 0.0, "step": 2753, "time_per_iteration": 2.8275938034057617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104119, "balance_loss_mlp": 1.09359312, "diversity_loss_mlp": 0.0, "epoch": 0.5298191612158523, "flos": 559746796032.0, "grad_norm": 0.06733658380062774, "language_loss": 0.85417688, "learning_rate": 0.0004760204441294247, "loss": 0.86521804, "num_input_tokens_seen": 229543072, "router_z_loss_mlp": 0.10534668, "routerloss_mlp": 0.0, "step": 2754, "time_per_iteration": 2.6338090896606445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104137, "balance_loss_mlp": 1.09376574, "diversity_loss_mlp": 0.0, "epoch": 0.5300115429011159, "flos": 514046352384.0, "grad_norm": 0.06936353635633287, "language_loss": 0.85999346, "learning_rate": 0.00047570926498626486, "loss": 0.87103486, "num_input_tokens_seen": 229615296, "router_z_loss_mlp": 0.10375977, "routerloss_mlp": 0.0, "step": 2755, "time_per_iteration": 2.716575860977173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01108637, "balance_loss_mlp": 1.09822416, "diversity_loss_mlp": 0.0, "epoch": 0.5302039245863793, "flos": 672789265920.0, "grad_norm": 0.061285448286525046, "language_loss": 0.81361842, "learning_rate": 0.00047539809527360474, "loss": 0.82470477, "num_input_tokens_seen": 229693728, "router_z_loss_mlp": 0.10412598, "routerloss_mlp": 0.0, "step": 2756, "time_per_iteration": 2.881225109100342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102434, "balance_loss_mlp": 1.0919373, "diversity_loss_mlp": 0.0, "epoch": 0.5303963062716429, "flos": 730836297216.0, "grad_norm": 0.05865021558391441, "language_loss": 0.82642096, "learning_rate": 0.0004750869351122511, "loss": 0.83744538, "num_input_tokens_seen": 229772144, "router_z_loss_mlp": 0.1050415, "routerloss_mlp": 0.0, "step": 2757, "time_per_iteration": 2.9978790283203125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096362, "balance_loss_mlp": 1.08600891, "diversity_loss_mlp": 0.0, "epoch": 0.5305886879569065, "flos": 573435085824.0, "grad_norm": 0.07787390265260127, "language_loss": 0.81663013, "learning_rate": 0.00047477578462300685, "loss": 0.82759368, "num_input_tokens_seen": 229847024, "router_z_loss_mlp": 0.10357666, "routerloss_mlp": 0.0, "step": 2758, "time_per_iteration": 2.700833797454834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090285, "balance_loss_mlp": 1.07975245, "diversity_loss_mlp": 0.0, "epoch": 0.5307810696421701, "flos": 695335352832.0, "grad_norm": 0.069319292192906, "language_loss": 0.80022508, "learning_rate": 0.0004744646439266718, "loss": 0.81112796, "num_input_tokens_seen": 229932416, "router_z_loss_mlp": 0.10528564, "routerloss_mlp": 0.0, "step": 2759, "time_per_iteration": 3.0144033432006836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084993, "balance_loss_mlp": 1.07477677, "diversity_loss_mlp": 0.0, "epoch": 0.5309734513274337, "flos": 648943322112.0, "grad_norm": 0.05678736813253772, "language_loss": 0.92058611, "learning_rate": 0.000474153513144041, "loss": 0.93143606, "num_input_tokens_seen": 230010976, "router_z_loss_mlp": 0.10223389, "routerloss_mlp": 0.0, "step": 2760, "time_per_iteration": 2.890305995941162 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082633, "balance_loss_mlp": 1.07224369, "diversity_loss_mlp": 0.0, "epoch": 0.5311658330126972, "flos": 604824288768.0, "grad_norm": 0.06975892982263965, "language_loss": 0.8659752, "learning_rate": 0.00047384239239590633, "loss": 0.87680155, "num_input_tokens_seen": 230093344, "router_z_loss_mlp": 0.10388184, "routerloss_mlp": 0.0, "step": 2761, "time_per_iteration": 2.864649772644043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076622, "balance_loss_mlp": 1.06607819, "diversity_loss_mlp": 0.0, "epoch": 0.5313582146979607, "flos": 558259361280.0, "grad_norm": 0.06592907525694008, "language_loss": 0.88956439, "learning_rate": 0.0004735312818030556, "loss": 0.90033066, "num_input_tokens_seen": 230165520, "router_z_loss_mlp": 0.10546875, "routerloss_mlp": 0.0, "step": 2762, "time_per_iteration": 2.7256298065185547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079985, "balance_loss_mlp": 1.06967998, "diversity_loss_mlp": 0.0, "epoch": 0.5315505963832243, "flos": 508410473472.0, "grad_norm": 0.06903030148880929, "language_loss": 0.82737643, "learning_rate": 0.0004732201814862727, "loss": 0.83817625, "num_input_tokens_seen": 230237808, "router_z_loss_mlp": 0.10302734, "routerloss_mlp": 0.0, "step": 2763, "time_per_iteration": 2.785104990005493 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078579, "balance_loss_mlp": 1.0687145, "diversity_loss_mlp": 0.0, "epoch": 0.5317429780684879, "flos": 626439080448.0, "grad_norm": 0.07391416357546753, "language_loss": 0.81619537, "learning_rate": 0.0004729090915663373, "loss": 0.82698119, "num_input_tokens_seen": 230321568, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 2764, "time_per_iteration": 2.841716766357422 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00841129, "balance_loss_mlp": 1.43825924, "diversity_loss_mlp": 0.21717778, "epoch": 0.5319353597537514, "flos": 476744486400.0, "grad_norm": 0.03676047653681057, "language_loss": 0.84753668, "learning_rate": 0.00047259801216402534, "loss": 0.85594797, "num_input_tokens_seen": 230385376, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01341068, "step": 2765, "time_per_iteration": 2.5414865016937256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078812, "balance_loss_mlp": 1.06872129, "diversity_loss_mlp": 0.0, "epoch": 0.532127741439015, "flos": 501635524608.0, "grad_norm": 0.08353685320939014, "language_loss": 0.86307138, "learning_rate": 0.00047228694340010845, "loss": 0.87385947, "num_input_tokens_seen": 230449760, "router_z_loss_mlp": 0.10089111, "routerloss_mlp": 0.0, "step": 2766, "time_per_iteration": 2.571230173110962 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083419, "balance_loss_mlp": 1.07304192, "diversity_loss_mlp": 0.0, "epoch": 0.5323201231242786, "flos": 1164586512384.0, "grad_norm": 0.07758433064211989, "language_loss": 0.85983396, "learning_rate": 0.0004719758853953544, "loss": 0.87066811, "num_input_tokens_seen": 230536592, "router_z_loss_mlp": 0.1038208, "routerloss_mlp": 0.0, "step": 2767, "time_per_iteration": 3.5577545166015625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085303, "balance_loss_mlp": 1.07479465, "diversity_loss_mlp": 0.0, "epoch": 0.5325125048095422, "flos": 378702273024.0, "grad_norm": 0.08923013324738549, "language_loss": 0.83480549, "learning_rate": 0.00047166483827052645, "loss": 0.84565854, "num_input_tokens_seen": 230596688, "router_z_loss_mlp": 0.10510254, "routerloss_mlp": 0.0, "step": 2768, "time_per_iteration": 2.3904964923858643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01014357, "balance_loss_mlp": 1.0088253, "diversity_loss_mlp": 0.0, "epoch": 0.5327048864948057, "flos": 1541353121280.0, "grad_norm": 0.015852342000118255, "language_loss": 0.77078491, "learning_rate": 0.00047135380214638413, "loss": 0.78092843, "num_input_tokens_seen": 230829408, "router_z_loss_mlp": 0.05541992, "routerloss_mlp": 0.0, "step": 2769, "time_per_iteration": 4.993681907653809 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100393, "balance_loss_mlp": 1.08974218, "diversity_loss_mlp": 0.0, "epoch": 0.5328972681800692, "flos": 911272923648.0, "grad_norm": 0.07499519146645399, "language_loss": 0.8344022, "learning_rate": 0.000471042777143682, "loss": 0.84540612, "num_input_tokens_seen": 230912528, "router_z_loss_mlp": 0.10656738, "routerloss_mlp": 0.0, "step": 2770, "time_per_iteration": 3.2187654972076416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099844, "balance_loss_mlp": 1.0895741, "diversity_loss_mlp": 0.0, "epoch": 0.5330896498653328, "flos": 473898097152.0, "grad_norm": 0.07177386868704265, "language_loss": 0.79602164, "learning_rate": 0.0004707317633831707, "loss": 0.80702007, "num_input_tokens_seen": 230979424, "router_z_loss_mlp": 0.10266113, "routerloss_mlp": 0.0, "step": 2771, "time_per_iteration": 2.5579092502593994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097617, "balance_loss_mlp": 1.08694136, "diversity_loss_mlp": 0.0, "epoch": 0.5332820315505964, "flos": 501635524608.0, "grad_norm": 0.08358365289860634, "language_loss": 0.78326285, "learning_rate": 0.00047042076098559673, "loss": 0.79423904, "num_input_tokens_seen": 231046416, "router_z_loss_mlp": 0.10687256, "routerloss_mlp": 0.0, "step": 2772, "time_per_iteration": 2.6240808963775635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089428, "balance_loss_mlp": 1.07924104, "diversity_loss_mlp": 0.0, "epoch": 0.53347441323586, "flos": 924439951872.0, "grad_norm": 0.07827879900232339, "language_loss": 0.7374208, "learning_rate": 0.00047010977007170174, "loss": 0.7483151, "num_input_tokens_seen": 231136064, "router_z_loss_mlp": 0.10186768, "routerloss_mlp": 0.0, "step": 2773, "time_per_iteration": 3.239807605743408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108646, "balance_loss_mlp": 1.07606506, "diversity_loss_mlp": 0.0, "epoch": 0.5336667949211235, "flos": 574455587328.0, "grad_norm": 0.0770996892807777, "language_loss": 0.82462615, "learning_rate": 0.00046979879076222334, "loss": 0.83549076, "num_input_tokens_seen": 231203616, "router_z_loss_mlp": 0.10400391, "routerloss_mlp": 0.0, "step": 2774, "time_per_iteration": 2.6871917247772217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081473, "balance_loss_mlp": 1.07122087, "diversity_loss_mlp": 0.0, "epoch": 0.533859176606387, "flos": 1064664082944.0, "grad_norm": 0.060681013844514214, "language_loss": 0.84932172, "learning_rate": 0.0004694878231778939, "loss": 0.86013645, "num_input_tokens_seen": 231287008, "router_z_loss_mlp": 0.10253906, "routerloss_mlp": 0.0, "step": 2775, "time_per_iteration": 3.3516969680786133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083848, "balance_loss_mlp": 1.07336903, "diversity_loss_mlp": 0.0, "epoch": 0.5340515582916506, "flos": 746602665984.0, "grad_norm": 0.06561156947814625, "language_loss": 0.84353071, "learning_rate": 0.0004691768674394423, "loss": 0.85436922, "num_input_tokens_seen": 231365296, "router_z_loss_mlp": 0.1048584, "routerloss_mlp": 0.0, "step": 2776, "time_per_iteration": 2.9356815814971924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010203, "balance_loss_mlp": 1.01491189, "diversity_loss_mlp": 0.0, "epoch": 0.5342439399769142, "flos": 1445685594624.0, "grad_norm": 0.017317997453326725, "language_loss": 0.84484011, "learning_rate": 0.0004688659236675918, "loss": 0.85504305, "num_input_tokens_seen": 231579040, "router_z_loss_mlp": 0.05395508, "routerloss_mlp": 0.0, "step": 2777, "time_per_iteration": 4.766932010650635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017275, "balance_loss_mlp": 1.01186275, "diversity_loss_mlp": 0.0, "epoch": 0.5344363216621778, "flos": 1427569505280.0, "grad_norm": 0.016201867017030143, "language_loss": 0.76653534, "learning_rate": 0.00046855499198306187, "loss": 0.77670807, "num_input_tokens_seen": 231812736, "router_z_loss_mlp": 0.05419922, "routerloss_mlp": 0.0, "step": 2778, "time_per_iteration": 5.022111177444458 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081648, "balance_loss_mlp": 1.07109189, "diversity_loss_mlp": 0.0, "epoch": 0.5346287033474413, "flos": 527618644992.0, "grad_norm": 0.08348606714079294, "language_loss": 0.79229748, "learning_rate": 0.00046824407250656676, "loss": 0.803114, "num_input_tokens_seen": 231883840, "router_z_loss_mlp": 0.10565186, "routerloss_mlp": 0.0, "step": 2779, "time_per_iteration": 2.6202685832977295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079296, "balance_loss_mlp": 1.06859064, "diversity_loss_mlp": 0.0, "epoch": 0.5348210850327049, "flos": 510762765312.0, "grad_norm": 0.0812040646365834, "language_loss": 0.83481312, "learning_rate": 0.0004679331653588161, "loss": 0.84560603, "num_input_tokens_seen": 231955360, "router_z_loss_mlp": 0.1071167, "routerloss_mlp": 0.0, "step": 2780, "time_per_iteration": 2.6287879943847656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083945, "balance_loss_mlp": 1.07337165, "diversity_loss_mlp": 0.0, "epoch": 0.5350134667179685, "flos": 462668184576.0, "grad_norm": 0.08148878126655458, "language_loss": 0.85570091, "learning_rate": 0.0004676222706605147, "loss": 0.86654037, "num_input_tokens_seen": 232027088, "router_z_loss_mlp": 0.10583496, "routerloss_mlp": 0.0, "step": 2781, "time_per_iteration": 2.634186029434204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082358, "balance_loss_mlp": 1.07175457, "diversity_loss_mlp": 0.0, "epoch": 0.535205848403232, "flos": 708875712000.0, "grad_norm": 0.08561637601090062, "language_loss": 0.84885913, "learning_rate": 0.0004673113885323626, "loss": 0.85968268, "num_input_tokens_seen": 232099472, "router_z_loss_mlp": 0.10601807, "routerloss_mlp": 0.0, "step": 2782, "time_per_iteration": 2.839108943939209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084061, "balance_loss_mlp": 1.07358241, "diversity_loss_mlp": 0.0, "epoch": 0.5353982300884956, "flos": 894241575936.0, "grad_norm": 0.0730092425976976, "language_loss": 0.78793383, "learning_rate": 0.00046700051909505494, "loss": 0.79877448, "num_input_tokens_seen": 232182528, "router_z_loss_mlp": 0.10479736, "routerloss_mlp": 0.0, "step": 2783, "time_per_iteration": 3.1548988819122314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080824, "balance_loss_mlp": 1.06943369, "diversity_loss_mlp": 0.0, "epoch": 0.5355906117737591, "flos": 535965092352.0, "grad_norm": 0.06678731146909953, "language_loss": 0.84066731, "learning_rate": 0.000466689662469282, "loss": 0.85147554, "num_input_tokens_seen": 232253344, "router_z_loss_mlp": 0.11383057, "routerloss_mlp": 0.0, "step": 2784, "time_per_iteration": 2.6213507652282715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082867, "balance_loss_mlp": 1.07235312, "diversity_loss_mlp": 0.0, "epoch": 0.5357829934590227, "flos": 868846528512.0, "grad_norm": 0.06931446022689573, "language_loss": 0.83996934, "learning_rate": 0.00046637881877572917, "loss": 0.85079801, "num_input_tokens_seen": 232337232, "router_z_loss_mlp": 0.10522461, "routerloss_mlp": 0.0, "step": 2785, "time_per_iteration": 3.1161208152770996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084248, "balance_loss_mlp": 1.07350779, "diversity_loss_mlp": 0.0, "epoch": 0.5359753751442863, "flos": 553287905280.0, "grad_norm": 0.05978198327100757, "language_loss": 0.84824258, "learning_rate": 0.0004660679881350764, "loss": 0.85908508, "num_input_tokens_seen": 232412864, "router_z_loss_mlp": 0.10742188, "routerloss_mlp": 0.0, "step": 2786, "time_per_iteration": 2.7317774295806885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043593, "balance_loss_mlp": 1.0375849, "diversity_loss_mlp": 0.0, "epoch": 0.5361677568295499, "flos": 1480499347968.0, "grad_norm": 0.025126940202686972, "language_loss": 0.75608146, "learning_rate": 0.0004657571706679988, "loss": 0.7665174, "num_input_tokens_seen": 232639888, "router_z_loss_mlp": 0.06005859, "routerloss_mlp": 0.0, "step": 2787, "time_per_iteration": 5.0151801109313965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079952, "balance_loss_mlp": 1.06945598, "diversity_loss_mlp": 0.0, "epoch": 0.5363601385148133, "flos": 806255700480.0, "grad_norm": 0.07181749108152896, "language_loss": 0.78038859, "learning_rate": 0.0004654463664951667, "loss": 0.79118812, "num_input_tokens_seen": 232719248, "router_z_loss_mlp": 0.1050415, "routerloss_mlp": 0.0, "step": 2788, "time_per_iteration": 2.9862492084503174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074852, "balance_loss_mlp": 1.06444538, "diversity_loss_mlp": 0.0, "epoch": 0.5365525202000769, "flos": 507879300096.0, "grad_norm": 0.06160548649513732, "language_loss": 0.83008492, "learning_rate": 0.0004651355757372447, "loss": 0.84083349, "num_input_tokens_seen": 232788464, "router_z_loss_mlp": 0.10400391, "routerloss_mlp": 0.0, "step": 2789, "time_per_iteration": 2.6209347248077393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00838367, "balance_loss_mlp": 1.43426061, "diversity_loss_mlp": 0.2158158, "epoch": 0.5367449018853405, "flos": 528930611712.0, "grad_norm": 0.029696530744324656, "language_loss": 0.8589375, "learning_rate": 0.00046482479851489274, "loss": 0.86732113, "num_input_tokens_seen": 232859792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01332852, "step": 2790, "time_per_iteration": 2.6991934776306152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077231, "balance_loss_mlp": 1.06660962, "diversity_loss_mlp": 0.0, "epoch": 0.5369372835706041, "flos": 649934088192.0, "grad_norm": 0.09378702232215988, "language_loss": 0.77937293, "learning_rate": 0.00046451403494876525, "loss": 0.79014528, "num_input_tokens_seen": 232941472, "router_z_loss_mlp": 0.10632324, "routerloss_mlp": 0.0, "step": 2791, "time_per_iteration": 2.8735973834991455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070268, "balance_loss_mlp": 1.05943799, "diversity_loss_mlp": 0.0, "epoch": 0.5371296652558677, "flos": 584489530368.0, "grad_norm": 0.07434319158841775, "language_loss": 0.84554839, "learning_rate": 0.0004642032851595111, "loss": 0.85625106, "num_input_tokens_seen": 233017120, "router_z_loss_mlp": 0.1083374, "routerloss_mlp": 0.0, "step": 2792, "time_per_iteration": 2.7458536624908447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065699, "balance_loss_mlp": 1.05472004, "diversity_loss_mlp": 0.0, "epoch": 0.5373220469411312, "flos": 595872516096.0, "grad_norm": 0.06545464420604186, "language_loss": 0.85163087, "learning_rate": 0.00046389254926777404, "loss": 0.86228788, "num_input_tokens_seen": 233095408, "router_z_loss_mlp": 0.10980225, "routerloss_mlp": 0.0, "step": 2793, "time_per_iteration": 2.823887825012207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062925, "balance_loss_mlp": 1.0519762, "diversity_loss_mlp": 0.0, "epoch": 0.5375144286263948, "flos": 1114426335744.0, "grad_norm": 0.06502650627416932, "language_loss": 0.78292251, "learning_rate": 0.0004635818273941926, "loss": 0.79355174, "num_input_tokens_seen": 233191056, "router_z_loss_mlp": 0.10955811, "routerloss_mlp": 0.0, "step": 2794, "time_per_iteration": 3.569359302520752 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058844, "balance_loss_mlp": 1.04798412, "diversity_loss_mlp": 0.0, "epoch": 0.5377068103116583, "flos": 595608215040.0, "grad_norm": 0.0851115940139546, "language_loss": 0.81696212, "learning_rate": 0.0004632711196593997, "loss": 0.82755053, "num_input_tokens_seen": 233265536, "router_z_loss_mlp": 0.10876465, "routerloss_mlp": 0.0, "step": 2795, "time_per_iteration": 2.763248920440674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059516, "balance_loss_mlp": 1.04872167, "diversity_loss_mlp": 0.0, "epoch": 0.5378991919969219, "flos": 884200292352.0, "grad_norm": 0.08577601840657965, "language_loss": 0.85307401, "learning_rate": 0.00046296042618402297, "loss": 0.86366916, "num_input_tokens_seen": 233348224, "router_z_loss_mlp": 0.10791016, "routerloss_mlp": 0.0, "step": 2796, "time_per_iteration": 3.059995651245117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065204, "balance_loss_mlp": 1.05436158, "diversity_loss_mlp": 0.0, "epoch": 0.5380915736821854, "flos": 710664523776.0, "grad_norm": 0.05816929772054262, "language_loss": 0.79285312, "learning_rate": 0.0004626497470886839, "loss": 0.80350512, "num_input_tokens_seen": 233429344, "router_z_loss_mlp": 0.10845947, "routerloss_mlp": 0.0, "step": 2797, "time_per_iteration": 2.9551138877868652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059667, "balance_loss_mlp": 1.04897988, "diversity_loss_mlp": 0.0, "epoch": 0.538283955367449, "flos": 556999151616.0, "grad_norm": 0.06686475877008137, "language_loss": 0.82082057, "learning_rate": 0.00046233908249399897, "loss": 0.83141726, "num_input_tokens_seen": 233504944, "router_z_loss_mlp": 0.10693359, "routerloss_mlp": 0.0, "step": 2798, "time_per_iteration": 2.7494163513183594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071071, "balance_loss_mlp": 1.06012726, "diversity_loss_mlp": 0.0, "epoch": 0.5384763370527126, "flos": 513470762496.0, "grad_norm": 0.06311972638358435, "language_loss": 0.78919041, "learning_rate": 0.00046202843252057905, "loss": 0.79990107, "num_input_tokens_seen": 233573072, "router_z_loss_mlp": 0.10943604, "routerloss_mlp": 0.0, "step": 2799, "time_per_iteration": 2.586824655532837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076382, "balance_loss_mlp": 1.06545627, "diversity_loss_mlp": 0.0, "epoch": 0.5386687187379762, "flos": 489736046592.0, "grad_norm": 0.06763496495115903, "language_loss": 0.83705521, "learning_rate": 0.00046171779728902896, "loss": 0.84781897, "num_input_tokens_seen": 233640896, "router_z_loss_mlp": 0.109375, "routerloss_mlp": 0.0, "step": 2800, "time_per_iteration": 2.5922951698303223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084728, "balance_loss_mlp": 1.07354665, "diversity_loss_mlp": 0.0, "epoch": 0.5388611004232398, "flos": 482657149440.0, "grad_norm": 0.12725923305511472, "language_loss": 0.86135888, "learning_rate": 0.000461407176919948, "loss": 0.87220615, "num_input_tokens_seen": 233703904, "router_z_loss_mlp": 0.11181641, "routerloss_mlp": 0.0, "step": 2801, "time_per_iteration": 2.532080888748169 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085364, "balance_loss_mlp": 1.07459974, "diversity_loss_mlp": 0.0, "epoch": 0.5390534821085032, "flos": 560984610816.0, "grad_norm": 0.08372818850883645, "language_loss": 0.85317719, "learning_rate": 0.00046109657153392997, "loss": 0.8640309, "num_input_tokens_seen": 233779248, "router_z_loss_mlp": 0.10772705, "routerloss_mlp": 0.0, "step": 2802, "time_per_iteration": 2.7498726844787598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082922, "balance_loss_mlp": 1.07185912, "diversity_loss_mlp": 0.0, "epoch": 0.5392458637937668, "flos": 488377092096.0, "grad_norm": 0.07972844989907181, "language_loss": 0.82981819, "learning_rate": 0.0004607859812515622, "loss": 0.84064734, "num_input_tokens_seen": 233847520, "router_z_loss_mlp": 0.11071777, "routerloss_mlp": 0.0, "step": 2803, "time_per_iteration": 2.5823397636413574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077838, "balance_loss_mlp": 1.06679916, "diversity_loss_mlp": 0.0, "epoch": 0.5394382454790304, "flos": 512057479680.0, "grad_norm": 0.06982591680837838, "language_loss": 0.88185596, "learning_rate": 0.00046047540619342667, "loss": 0.89263427, "num_input_tokens_seen": 233911328, "router_z_loss_mlp": 0.1104126, "routerloss_mlp": 0.0, "step": 2804, "time_per_iteration": 2.582594156265259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089122, "balance_loss_mlp": 1.07845902, "diversity_loss_mlp": 0.0, "epoch": 0.539630627164294, "flos": 567586662912.0, "grad_norm": 0.06923180186476277, "language_loss": 0.80359995, "learning_rate": 0.00046016484648009933, "loss": 0.81449121, "num_input_tokens_seen": 233987104, "router_z_loss_mlp": 0.10675049, "routerloss_mlp": 0.0, "step": 2805, "time_per_iteration": 2.705085277557373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082055, "balance_loss_mlp": 1.0713259, "diversity_loss_mlp": 0.0, "epoch": 0.5398230088495575, "flos": 526462322688.0, "grad_norm": 0.06938884531628577, "language_loss": 0.81049907, "learning_rate": 0.0004598543022321501, "loss": 0.82131958, "num_input_tokens_seen": 234057216, "router_z_loss_mlp": 0.10736084, "routerloss_mlp": 0.0, "step": 2806, "time_per_iteration": 2.6722495555877686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00855076, "balance_loss_mlp": 1.46593428, "diversity_loss_mlp": 0.21781196, "epoch": 0.5400153905348211, "flos": 538764493824.0, "grad_norm": 0.030466031644405155, "language_loss": 0.79783833, "learning_rate": 0.0004595437735701433, "loss": 0.80638903, "num_input_tokens_seen": 234129984, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01320273, "step": 2807, "time_per_iteration": 2.734110116958618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088135, "balance_loss_mlp": 1.07728648, "diversity_loss_mlp": 0.0, "epoch": 0.5402077722200846, "flos": 513539771904.0, "grad_norm": 0.08474622827734493, "language_loss": 0.83849192, "learning_rate": 0.00045923326061463623, "loss": 0.84937334, "num_input_tokens_seen": 234203920, "router_z_loss_mlp": 0.10858154, "routerloss_mlp": 0.0, "step": 2808, "time_per_iteration": 2.7606189250946045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089545, "balance_loss_mlp": 1.07878006, "diversity_loss_mlp": 0.0, "epoch": 0.5404001539053482, "flos": 676258232832.0, "grad_norm": 0.06442619071995537, "language_loss": 0.8173002, "learning_rate": 0.00045892276348618113, "loss": 0.82819563, "num_input_tokens_seen": 234285440, "router_z_loss_mlp": 0.10772705, "routerloss_mlp": 0.0, "step": 2809, "time_per_iteration": 2.9691591262817383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01033956, "balance_loss_mlp": 1.02887774, "diversity_loss_mlp": 0.0, "epoch": 0.5405925355906118, "flos": 1554834009600.0, "grad_norm": 0.01908051648382603, "language_loss": 0.78260827, "learning_rate": 0.0004586122823053235, "loss": 0.79294789, "num_input_tokens_seen": 234521424, "router_z_loss_mlp": 0.05078125, "routerloss_mlp": 0.0, "step": 2810, "time_per_iteration": 4.957923173904419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089256, "balance_loss_mlp": 1.07848597, "diversity_loss_mlp": 0.0, "epoch": 0.5407849172758753, "flos": 647310154752.0, "grad_norm": 0.05960464217413758, "language_loss": 0.80596066, "learning_rate": 0.000458301817192603, "loss": 0.81685317, "num_input_tokens_seen": 234601632, "router_z_loss_mlp": 0.10778809, "routerloss_mlp": 0.0, "step": 2811, "time_per_iteration": 2.852247714996338 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01021724, "balance_loss_mlp": 1.0165503, "diversity_loss_mlp": 0.0, "epoch": 0.5409772989611389, "flos": 1407407643648.0, "grad_norm": 0.015447521326512613, "language_loss": 0.8084178, "learning_rate": 0.00045799136826855263, "loss": 0.81863511, "num_input_tokens_seen": 234825776, "router_z_loss_mlp": 0.05175781, "routerloss_mlp": 0.0, "step": 2812, "time_per_iteration": 4.808724880218506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080487, "balance_loss_mlp": 1.06993747, "diversity_loss_mlp": 0.0, "epoch": 0.5411696806464025, "flos": 554389899264.0, "grad_norm": 0.06805695837678187, "language_loss": 0.87130654, "learning_rate": 0.00045768093565369983, "loss": 0.88211143, "num_input_tokens_seen": 234901504, "router_z_loss_mlp": 0.10552979, "routerloss_mlp": 0.0, "step": 2813, "time_per_iteration": 2.7794101238250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090945, "balance_loss_mlp": 1.08034182, "diversity_loss_mlp": 0.0, "epoch": 0.5413620623316661, "flos": 528122654208.0, "grad_norm": 0.06578755075233327, "language_loss": 0.8208549, "learning_rate": 0.0004573705194685646, "loss": 0.83176434, "num_input_tokens_seen": 234970288, "router_z_loss_mlp": 0.1060791, "routerloss_mlp": 0.0, "step": 2814, "time_per_iteration": 2.686871290206909 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084437, "balance_loss_mlp": 1.07364845, "diversity_loss_mlp": 0.0, "epoch": 0.5415544440169295, "flos": 598741300224.0, "grad_norm": 0.07321549809116977, "language_loss": 0.84966654, "learning_rate": 0.00045706011983366157, "loss": 0.86051095, "num_input_tokens_seen": 235039984, "router_z_loss_mlp": 0.10784912, "routerloss_mlp": 0.0, "step": 2815, "time_per_iteration": 2.676772117614746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00843207, "balance_loss_mlp": 1.44560027, "diversity_loss_mlp": 0.21445701, "epoch": 0.5417468257021931, "flos": 470757671424.0, "grad_norm": 0.03775972378408833, "language_loss": 0.82685602, "learning_rate": 0.00045674973686949847, "loss": 0.83528805, "num_input_tokens_seen": 235105232, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01317827, "step": 2816, "time_per_iteration": 2.548164129257202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079521, "balance_loss_mlp": 1.06887531, "diversity_loss_mlp": 0.0, "epoch": 0.5419392073874567, "flos": 680819281920.0, "grad_norm": 0.06715248152064907, "language_loss": 0.85478067, "learning_rate": 0.0004564393706965766, "loss": 0.86557591, "num_input_tokens_seen": 235192560, "router_z_loss_mlp": 0.10656738, "routerloss_mlp": 0.0, "step": 2817, "time_per_iteration": 2.9715416431427 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078759, "balance_loss_mlp": 1.06789875, "diversity_loss_mlp": 0.0, "epoch": 0.5421315890727203, "flos": 462374148096.0, "grad_norm": 0.07300594242261846, "language_loss": 0.81410033, "learning_rate": 0.00045612902143539116, "loss": 0.82488787, "num_input_tokens_seen": 235258448, "router_z_loss_mlp": 0.10864258, "routerloss_mlp": 0.0, "step": 2818, "time_per_iteration": 2.5861568450927734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069694, "balance_loss_mlp": 1.05926371, "diversity_loss_mlp": 0.0, "epoch": 0.5423239707579839, "flos": 436959277056.0, "grad_norm": 0.07796543703625758, "language_loss": 0.8169418, "learning_rate": 0.00045581868920642986, "loss": 0.82763875, "num_input_tokens_seen": 235322176, "router_z_loss_mlp": 0.10437012, "routerloss_mlp": 0.0, "step": 2819, "time_per_iteration": 2.495675563812256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079235, "balance_loss_mlp": 1.06864905, "diversity_loss_mlp": 0.0, "epoch": 0.5425163524432474, "flos": 458314536960.0, "grad_norm": 0.08284985931126, "language_loss": 0.79605496, "learning_rate": 0.00045550837413017457, "loss": 0.80684733, "num_input_tokens_seen": 235390960, "router_z_loss_mlp": 0.105896, "routerloss_mlp": 0.0, "step": 2820, "time_per_iteration": 2.5968475341796875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081511, "balance_loss_mlp": 1.07137275, "diversity_loss_mlp": 0.0, "epoch": 0.542708734128511, "flos": 419495500800.0, "grad_norm": 0.06853869944040722, "language_loss": 0.85501075, "learning_rate": 0.0004551980763271005, "loss": 0.86582589, "num_input_tokens_seen": 235460976, "router_z_loss_mlp": 0.10137939, "routerloss_mlp": 0.0, "step": 2821, "time_per_iteration": 2.6689629554748535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080137, "balance_loss_mlp": 1.06970072, "diversity_loss_mlp": 0.0, "epoch": 0.5429011158137745, "flos": 678454880256.0, "grad_norm": 0.07047505467714002, "language_loss": 0.83788973, "learning_rate": 0.0004548877959176756, "loss": 0.84869111, "num_input_tokens_seen": 235540912, "router_z_loss_mlp": 0.10443115, "routerloss_mlp": 0.0, "step": 2822, "time_per_iteration": 2.8898305892944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079374, "balance_loss_mlp": 1.06903815, "diversity_loss_mlp": 0.0, "epoch": 0.5430934974990381, "flos": 540924065280.0, "grad_norm": 0.06782192405371351, "language_loss": 0.86297488, "learning_rate": 0.00045457753302236166, "loss": 0.87376869, "num_input_tokens_seen": 235608736, "router_z_loss_mlp": 0.10339355, "routerloss_mlp": 0.0, "step": 2823, "time_per_iteration": 2.626262903213501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087962, "balance_loss_mlp": 1.07755554, "diversity_loss_mlp": 0.0, "epoch": 0.5432858791843016, "flos": 658468486656.0, "grad_norm": 0.07336203540826484, "language_loss": 0.87131381, "learning_rate": 0.00045426728776161353, "loss": 0.88219345, "num_input_tokens_seen": 235678720, "router_z_loss_mlp": 0.10412598, "routerloss_mlp": 0.0, "step": 2824, "time_per_iteration": 2.7630255222320557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085716, "balance_loss_mlp": 1.07529116, "diversity_loss_mlp": 0.0, "epoch": 0.5434782608695652, "flos": 531935216640.0, "grad_norm": 0.07766893457840997, "language_loss": 0.81382459, "learning_rate": 0.00045395706025587863, "loss": 0.82468176, "num_input_tokens_seen": 235748704, "router_z_loss_mlp": 0.10424805, "routerloss_mlp": 0.0, "step": 2825, "time_per_iteration": 2.653036594390869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070472, "balance_loss_mlp": 1.05976105, "diversity_loss_mlp": 0.0, "epoch": 0.5436706425548288, "flos": 608501030400.0, "grad_norm": 0.08392292239142347, "language_loss": 0.82965428, "learning_rate": 0.00045364685062559843, "loss": 0.84035897, "num_input_tokens_seen": 235828224, "router_z_loss_mlp": 0.10717773, "routerloss_mlp": 0.0, "step": 2826, "time_per_iteration": 2.8091156482696533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075397, "balance_loss_mlp": 1.06498957, "diversity_loss_mlp": 0.0, "epoch": 0.5438630242400924, "flos": 705418854912.0, "grad_norm": 0.06510139608888613, "language_loss": 0.91622829, "learning_rate": 0.0004533366589912067, "loss": 0.92698228, "num_input_tokens_seen": 235909392, "router_z_loss_mlp": 0.10412598, "routerloss_mlp": 0.0, "step": 2827, "time_per_iteration": 2.949005365371704 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075847, "balance_loss_mlp": 1.06538677, "diversity_loss_mlp": 0.0, "epoch": 0.544055405925356, "flos": 856425788928.0, "grad_norm": 0.07049343673366977, "language_loss": 0.77641904, "learning_rate": 0.0004530264854731306, "loss": 0.78717756, "num_input_tokens_seen": 235983888, "router_z_loss_mlp": 0.10461426, "routerloss_mlp": 0.0, "step": 2828, "time_per_iteration": 3.054252862930298 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079568, "balance_loss_mlp": 1.06920242, "diversity_loss_mlp": 0.0, "epoch": 0.5442477876106194, "flos": 571779523584.0, "grad_norm": 0.05986165572949975, "language_loss": 0.84122354, "learning_rate": 0.00045271633019179034, "loss": 0.85201919, "num_input_tokens_seen": 236063056, "router_z_loss_mlp": 0.10369873, "routerloss_mlp": 0.0, "step": 2829, "time_per_iteration": 2.788818836212158 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077904, "balance_loss_mlp": 1.06762242, "diversity_loss_mlp": 0.0, "epoch": 0.544440169295883, "flos": 625556971008.0, "grad_norm": 0.05963281032217842, "language_loss": 0.87701666, "learning_rate": 0.0004524061932675986, "loss": 0.88779569, "num_input_tokens_seen": 236141104, "router_z_loss_mlp": 0.10284424, "routerloss_mlp": 0.0, "step": 2830, "time_per_iteration": 2.861154079437256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073509, "balance_loss_mlp": 1.06306028, "diversity_loss_mlp": 0.0, "epoch": 0.5446325509811466, "flos": 836244103680.0, "grad_norm": 0.11132414831600651, "language_loss": 0.87095535, "learning_rate": 0.00045209607482096125, "loss": 0.88169038, "num_input_tokens_seen": 236220320, "router_z_loss_mlp": 0.10455322, "routerloss_mlp": 0.0, "step": 2831, "time_per_iteration": 3.041248321533203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107632, "balance_loss_mlp": 1.06573415, "diversity_loss_mlp": 0.0, "epoch": 0.5448249326664102, "flos": 483381043200.0, "grad_norm": 0.07049073021000962, "language_loss": 0.84385192, "learning_rate": 0.0004517859749722772, "loss": 0.85461509, "num_input_tokens_seen": 236288208, "router_z_loss_mlp": 0.105896, "routerloss_mlp": 0.0, "step": 2832, "time_per_iteration": 2.663478374481201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075242, "balance_loss_mlp": 1.0643816, "diversity_loss_mlp": 0.0, "epoch": 0.5450173143516738, "flos": 561107948544.0, "grad_norm": 0.06386820666055518, "language_loss": 0.79316235, "learning_rate": 0.0004514758938419376, "loss": 0.80391467, "num_input_tokens_seen": 236366864, "router_z_loss_mlp": 0.10870361, "routerloss_mlp": 0.0, "step": 2833, "time_per_iteration": 2.8141582012176514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104275, "balance_loss_mlp": 1.03721869, "diversity_loss_mlp": 0.0, "epoch": 0.5452096960369373, "flos": 1470420988416.0, "grad_norm": 0.027736452139364785, "language_loss": 0.76920587, "learning_rate": 0.0004511658315503268, "loss": 0.77963334, "num_input_tokens_seen": 236597120, "router_z_loss_mlp": 0.05541992, "routerloss_mlp": 0.0, "step": 2834, "time_per_iteration": 4.960749864578247 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075071, "balance_loss_mlp": 1.06446719, "diversity_loss_mlp": 0.0, "epoch": 0.5454020777222008, "flos": 465064892928.0, "grad_norm": 0.06436328535255592, "language_loss": 0.83993077, "learning_rate": 0.00045085578821782175, "loss": 0.85068148, "num_input_tokens_seen": 236664192, "router_z_loss_mlp": 0.1060791, "routerloss_mlp": 0.0, "step": 2835, "time_per_iteration": 2.6025185585021973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01020548, "balance_loss_mlp": 1.01516008, "diversity_loss_mlp": 0.0, "epoch": 0.5455944594074644, "flos": 1469657820672.0, "grad_norm": 0.015651807900939278, "language_loss": 0.76134741, "learning_rate": 0.0004505457639647917, "loss": 0.77155292, "num_input_tokens_seen": 236888784, "router_z_loss_mlp": 0.05395508, "routerloss_mlp": 0.0, "step": 2836, "time_per_iteration": 4.911514043807983 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079494, "balance_loss_mlp": 1.06864595, "diversity_loss_mlp": 0.0, "epoch": 0.545786841092728, "flos": 533180371968.0, "grad_norm": 0.05502946705999508, "language_loss": 0.81078947, "learning_rate": 0.00045023575891159866, "loss": 0.82158434, "num_input_tokens_seen": 236962528, "router_z_loss_mlp": 0.10852051, "routerloss_mlp": 0.0, "step": 2837, "time_per_iteration": 2.7158284187316895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01008506, "balance_loss_mlp": 1.00321293, "diversity_loss_mlp": 0.0, "epoch": 0.5459792227779915, "flos": 1352389810176.0, "grad_norm": 0.010060791837063862, "language_loss": 0.74763811, "learning_rate": 0.00044992577317859764, "loss": 0.75772309, "num_input_tokens_seen": 237179360, "router_z_loss_mlp": 0.05297852, "routerloss_mlp": 0.0, "step": 2838, "time_per_iteration": 4.9448912143707275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078413, "balance_loss_mlp": 1.06803036, "diversity_loss_mlp": 0.0, "epoch": 0.5461716044632551, "flos": 637881537024.0, "grad_norm": 0.059936217606746015, "language_loss": 0.78111225, "learning_rate": 0.0004496158068861354, "loss": 0.79189646, "num_input_tokens_seen": 237256240, "router_z_loss_mlp": 0.1038208, "routerloss_mlp": 0.0, "step": 2839, "time_per_iteration": 2.8019115924835205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081531, "balance_loss_mlp": 1.07090366, "diversity_loss_mlp": 0.0, "epoch": 0.5463639861485187, "flos": 602751352320.0, "grad_norm": 0.06804602152838367, "language_loss": 0.80713242, "learning_rate": 0.00044930586015455207, "loss": 0.81794775, "num_input_tokens_seen": 237334272, "router_z_loss_mlp": 0.10638428, "routerloss_mlp": 0.0, "step": 2840, "time_per_iteration": 2.771359443664551 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076777, "balance_loss_mlp": 1.06646562, "diversity_loss_mlp": 0.0, "epoch": 0.5465563678337823, "flos": 642516738048.0, "grad_norm": 0.0578733121218936, "language_loss": 0.88904727, "learning_rate": 0.000448995933104179, "loss": 0.89981508, "num_input_tokens_seen": 237415408, "router_z_loss_mlp": 0.10314941, "routerloss_mlp": 0.0, "step": 2841, "time_per_iteration": 2.8486392498016357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081465, "balance_loss_mlp": 1.07075977, "diversity_loss_mlp": 0.0, "epoch": 0.5467487495190458, "flos": 614154161664.0, "grad_norm": 0.07392730491467848, "language_loss": 0.80162299, "learning_rate": 0.00044868602585534077, "loss": 0.81243765, "num_input_tokens_seen": 237493232, "router_z_loss_mlp": 0.10699463, "routerloss_mlp": 0.0, "step": 2842, "time_per_iteration": 2.8463480472564697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074348, "balance_loss_mlp": 1.06379187, "diversity_loss_mlp": 0.0, "epoch": 0.5469411312043093, "flos": 461190661632.0, "grad_norm": 0.0858024928700591, "language_loss": 0.89360344, "learning_rate": 0.0004483761385283541, "loss": 0.90434694, "num_input_tokens_seen": 237556624, "router_z_loss_mlp": 0.10565186, "routerloss_mlp": 0.0, "step": 2843, "time_per_iteration": 2.534032106399536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00870358, "balance_loss_mlp": 1.4994092, "diversity_loss_mlp": 0.21570696, "epoch": 0.5471335128895729, "flos": 561197154816.0, "grad_norm": 0.030684440159293704, "language_loss": 0.8165319, "learning_rate": 0.0004480662712435281, "loss": 0.82523549, "num_input_tokens_seen": 237632048, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01280049, "step": 2844, "time_per_iteration": 2.7523300647735596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081741, "balance_loss_mlp": 1.07085109, "diversity_loss_mlp": 0.0, "epoch": 0.5473258945748365, "flos": 518686695936.0, "grad_norm": 0.08261462073704483, "language_loss": 0.88389564, "learning_rate": 0.0004477564241211635, "loss": 0.89471304, "num_input_tokens_seen": 237699840, "router_z_loss_mlp": 0.10888672, "routerloss_mlp": 0.0, "step": 2845, "time_per_iteration": 2.5676896572113037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068187, "balance_loss_mlp": 1.0573566, "diversity_loss_mlp": 0.0, "epoch": 0.5475182762601001, "flos": 433828763136.0, "grad_norm": 0.07762403474355188, "language_loss": 0.868963, "learning_rate": 0.0004474465972815541, "loss": 0.87964487, "num_input_tokens_seen": 237762560, "router_z_loss_mlp": 0.10839844, "routerloss_mlp": 0.0, "step": 2846, "time_per_iteration": 2.4843738079071045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073996, "balance_loss_mlp": 1.06337464, "diversity_loss_mlp": 0.0, "epoch": 0.5477106579453636, "flos": 511560811008.0, "grad_norm": 0.05857404260801407, "language_loss": 0.87612844, "learning_rate": 0.000447136790844985, "loss": 0.88686836, "num_input_tokens_seen": 237837152, "router_z_loss_mlp": 0.10626221, "routerloss_mlp": 0.0, "step": 2847, "time_per_iteration": 2.659214973449707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068933, "balance_loss_mlp": 1.05774474, "diversity_loss_mlp": 0.0, "epoch": 0.5479030396306271, "flos": 675912439296.0, "grad_norm": 0.0657788254057266, "language_loss": 0.80922693, "learning_rate": 0.00044682700493173385, "loss": 0.81991625, "num_input_tokens_seen": 237909488, "router_z_loss_mlp": 0.11187744, "routerloss_mlp": 0.0, "step": 2848, "time_per_iteration": 2.8093039989471436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071859, "balance_loss_mlp": 1.06077814, "diversity_loss_mlp": 0.0, "epoch": 0.5480954213158907, "flos": 876090981888.0, "grad_norm": 0.06921376228249611, "language_loss": 0.80399549, "learning_rate": 0.00044651723966207004, "loss": 0.81471407, "num_input_tokens_seen": 237991056, "router_z_loss_mlp": 0.11090088, "routerloss_mlp": 0.0, "step": 2849, "time_per_iteration": 3.1084961891174316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069124, "balance_loss_mlp": 1.05826974, "diversity_loss_mlp": 0.0, "epoch": 0.5482878030011543, "flos": 622006511616.0, "grad_norm": 0.06382752106805908, "language_loss": 0.78137773, "learning_rate": 0.00044620749515625536, "loss": 0.79206896, "num_input_tokens_seen": 238064576, "router_z_loss_mlp": 0.10858154, "routerloss_mlp": 0.0, "step": 2850, "time_per_iteration": 2.8127682209014893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065938, "balance_loss_mlp": 1.05505395, "diversity_loss_mlp": 0.0, "epoch": 0.5484801846864179, "flos": 497207725056.0, "grad_norm": 0.07084116902380141, "language_loss": 0.85142213, "learning_rate": 0.00044589777153454334, "loss": 0.86208153, "num_input_tokens_seen": 238136464, "router_z_loss_mlp": 0.10888672, "routerloss_mlp": 0.0, "step": 2851, "time_per_iteration": 2.7690277099609375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063836, "balance_loss_mlp": 1.05239749, "diversity_loss_mlp": 0.0, "epoch": 0.5486725663716814, "flos": 442432171008.0, "grad_norm": 0.06308922523972363, "language_loss": 0.83850712, "learning_rate": 0.00044558806891717895, "loss": 0.84914547, "num_input_tokens_seen": 238198912, "router_z_loss_mlp": 0.11450195, "routerloss_mlp": 0.0, "step": 2852, "time_per_iteration": 2.542076587677002 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066106, "balance_loss_mlp": 1.05529404, "diversity_loss_mlp": 0.0, "epoch": 0.548864948056945, "flos": 655162504704.0, "grad_norm": 0.06000502851088379, "language_loss": 0.79783493, "learning_rate": 0.0004452783874243998, "loss": 0.808496, "num_input_tokens_seen": 238275184, "router_z_loss_mlp": 0.1081543, "routerloss_mlp": 0.0, "step": 2853, "time_per_iteration": 2.8680150508880615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070657, "balance_loss_mlp": 1.06022012, "diversity_loss_mlp": 0.0, "epoch": 0.5490573297422086, "flos": 546036111360.0, "grad_norm": 0.07387916596955035, "language_loss": 0.84572864, "learning_rate": 0.00044496872717643475, "loss": 0.85643518, "num_input_tokens_seen": 238348496, "router_z_loss_mlp": 0.10437012, "routerloss_mlp": 0.0, "step": 2854, "time_per_iteration": 2.676128625869751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048386, "balance_loss_mlp": 1.04261672, "diversity_loss_mlp": 0.0, "epoch": 0.5492497114274721, "flos": 1590309987840.0, "grad_norm": 0.03710413532206065, "language_loss": 0.77089292, "learning_rate": 0.00044465908829350453, "loss": 0.78137678, "num_input_tokens_seen": 238578464, "router_z_loss_mlp": 0.05761719, "routerloss_mlp": 0.0, "step": 2855, "time_per_iteration": 4.937518835067749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076411, "balance_loss_mlp": 1.06609333, "diversity_loss_mlp": 0.0, "epoch": 0.5494420931127356, "flos": 750906754560.0, "grad_norm": 0.06582649113696544, "language_loss": 0.81989098, "learning_rate": 0.0004443494708958217, "loss": 0.83065504, "num_input_tokens_seen": 238660256, "router_z_loss_mlp": 0.10321045, "routerloss_mlp": 0.0, "step": 2856, "time_per_iteration": 2.9764318466186523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077652, "balance_loss_mlp": 1.06707263, "diversity_loss_mlp": 0.0, "epoch": 0.5496344747979992, "flos": 626023904256.0, "grad_norm": 0.05962775351044122, "language_loss": 0.80705082, "learning_rate": 0.0004440398751035906, "loss": 0.81782728, "num_input_tokens_seen": 238745856, "router_z_loss_mlp": 0.10583496, "routerloss_mlp": 0.0, "step": 2857, "time_per_iteration": 2.8708760738372803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107925, "balance_loss_mlp": 1.06846118, "diversity_loss_mlp": 0.0, "epoch": 0.5498268564832628, "flos": 523111924224.0, "grad_norm": 0.08652259855452149, "language_loss": 0.83723986, "learning_rate": 0.00044373030103700645, "loss": 0.84803236, "num_input_tokens_seen": 238813888, "router_z_loss_mlp": 0.10791016, "routerloss_mlp": 0.0, "step": 2858, "time_per_iteration": 2.629887342453003 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00857386, "balance_loss_mlp": 1.47058845, "diversity_loss_mlp": 0.21831456, "epoch": 0.5500192381685264, "flos": 604587151872.0, "grad_norm": 0.03034959963101528, "language_loss": 0.79655832, "learning_rate": 0.000443420748816257, "loss": 0.80513215, "num_input_tokens_seen": 238885440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01293462, "step": 2859, "time_per_iteration": 2.8473408222198486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107821, "balance_loss_mlp": 1.06795764, "diversity_loss_mlp": 0.0, "epoch": 0.55021161985379, "flos": 520527264768.0, "grad_norm": 0.07076083110298415, "language_loss": 0.78692329, "learning_rate": 0.0004431112185615208, "loss": 0.79770535, "num_input_tokens_seen": 238960944, "router_z_loss_mlp": 0.10253906, "routerloss_mlp": 0.0, "step": 2860, "time_per_iteration": 2.751131534576416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082571, "balance_loss_mlp": 1.0721283, "diversity_loss_mlp": 0.0, "epoch": 0.5504040015390534, "flos": 489671806464.0, "grad_norm": 0.06396450124437818, "language_loss": 0.7993266, "learning_rate": 0.00044280171039296845, "loss": 0.81015229, "num_input_tokens_seen": 239030592, "router_z_loss_mlp": 0.10449219, "routerloss_mlp": 0.0, "step": 2861, "time_per_iteration": 2.606870651245117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082002, "balance_loss_mlp": 1.0716126, "diversity_loss_mlp": 0.0, "epoch": 0.550596383224317, "flos": 575787377664.0, "grad_norm": 0.0734058146638898, "language_loss": 0.8832019, "learning_rate": 0.0004424922244307616, "loss": 0.89402187, "num_input_tokens_seen": 239097440, "router_z_loss_mlp": 0.10394287, "routerloss_mlp": 0.0, "step": 2862, "time_per_iteration": 2.728055477142334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081346, "balance_loss_mlp": 1.07124305, "diversity_loss_mlp": 0.0, "epoch": 0.5507887649095806, "flos": 642445157376.0, "grad_norm": 0.08810368166009505, "language_loss": 0.82030249, "learning_rate": 0.00044218276079505315, "loss": 0.83111596, "num_input_tokens_seen": 239179872, "router_z_loss_mlp": 0.10101318, "routerloss_mlp": 0.0, "step": 2863, "time_per_iteration": 2.8925743103027344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076982, "balance_loss_mlp": 1.0667721, "diversity_loss_mlp": 0.0, "epoch": 0.5509811465948442, "flos": 531843812352.0, "grad_norm": 0.06918705117949257, "language_loss": 0.74817479, "learning_rate": 0.0004418733196059876, "loss": 0.75894463, "num_input_tokens_seen": 239251264, "router_z_loss_mlp": 0.10211182, "routerloss_mlp": 0.0, "step": 2864, "time_per_iteration": 2.747131109237671 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068041, "balance_loss_mlp": 1.0579797, "diversity_loss_mlp": 0.0, "epoch": 0.5511735282801077, "flos": 654747328512.0, "grad_norm": 0.060188467246496694, "language_loss": 0.79747194, "learning_rate": 0.0004415639009837008, "loss": 0.80815232, "num_input_tokens_seen": 239326688, "router_z_loss_mlp": 0.10058594, "routerloss_mlp": 0.0, "step": 2865, "time_per_iteration": 2.838609218597412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077074, "balance_loss_mlp": 1.06704867, "diversity_loss_mlp": 0.0, "epoch": 0.5513659099653713, "flos": 529498861056.0, "grad_norm": 0.06869441498871262, "language_loss": 0.82126647, "learning_rate": 0.00044125450504831955, "loss": 0.83203721, "num_input_tokens_seen": 239401248, "router_z_loss_mlp": 0.10021973, "routerloss_mlp": 0.0, "step": 2866, "time_per_iteration": 2.7267115116119385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080685, "balance_loss_mlp": 1.07046294, "diversity_loss_mlp": 0.0, "epoch": 0.5515582916506349, "flos": 554869315584.0, "grad_norm": 0.0812577822304444, "language_loss": 0.82503623, "learning_rate": 0.0004409451319199622, "loss": 0.83584309, "num_input_tokens_seen": 239471600, "router_z_loss_mlp": 0.10223389, "routerloss_mlp": 0.0, "step": 2867, "time_per_iteration": 2.6727194786071777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080071, "balance_loss_mlp": 1.07005203, "diversity_loss_mlp": 0.0, "epoch": 0.5517506733358984, "flos": 735407258112.0, "grad_norm": 0.07302760882162292, "language_loss": 0.84415638, "learning_rate": 0.0004406357817187381, "loss": 0.8549571, "num_input_tokens_seen": 239548592, "router_z_loss_mlp": 0.10021973, "routerloss_mlp": 0.0, "step": 2868, "time_per_iteration": 2.9669716358184814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084226, "balance_loss_mlp": 1.07424247, "diversity_loss_mlp": 0.0, "epoch": 0.551943055021162, "flos": 1115325697536.0, "grad_norm": 0.06120403113840053, "language_loss": 0.81250817, "learning_rate": 0.0004403264545647474, "loss": 0.82335043, "num_input_tokens_seen": 239644432, "router_z_loss_mlp": 0.09979248, "routerloss_mlp": 0.0, "step": 2869, "time_per_iteration": 3.535280704498291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092656, "balance_loss_mlp": 1.08244562, "diversity_loss_mlp": 0.0, "epoch": 0.5521354367064255, "flos": 544373208576.0, "grad_norm": 0.05305368525165607, "language_loss": 0.84751379, "learning_rate": 0.00044001715057808154, "loss": 0.85844034, "num_input_tokens_seen": 239723392, "router_z_loss_mlp": 0.10211182, "routerloss_mlp": 0.0, "step": 2870, "time_per_iteration": 2.757197618484497 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00867753, "balance_loss_mlp": 1.49414647, "diversity_loss_mlp": 0.21602358, "epoch": 0.5523278183916891, "flos": 936285101568.0, "grad_norm": 0.02933333976418528, "language_loss": 0.81627762, "learning_rate": 0.0004397078698788232, "loss": 0.82495517, "num_input_tokens_seen": 239806896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01266836, "step": 2871, "time_per_iteration": 3.241936445236206 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046288, "balance_loss_mlp": 1.04097104, "diversity_loss_mlp": 0.0, "epoch": 0.5525202000769527, "flos": 1465911696384.0, "grad_norm": 0.0256992480173019, "language_loss": 0.80442369, "learning_rate": 0.0004393986125870456, "loss": 0.81488657, "num_input_tokens_seen": 240037824, "router_z_loss_mlp": 0.05322266, "routerloss_mlp": 0.0, "step": 2872, "time_per_iteration": 4.879035234451294 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103168, "balance_loss_mlp": 1.09304726, "diversity_loss_mlp": 0.0, "epoch": 0.5527125817622163, "flos": 489800286720.0, "grad_norm": 0.06889966135830194, "language_loss": 0.78025937, "learning_rate": 0.00043908937882281343, "loss": 0.79129106, "num_input_tokens_seen": 240107952, "router_z_loss_mlp": 0.10119629, "routerloss_mlp": 0.0, "step": 2873, "time_per_iteration": 2.624072313308716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097291, "balance_loss_mlp": 1.08644319, "diversity_loss_mlp": 0.0, "epoch": 0.5529049634474797, "flos": 634914008064.0, "grad_norm": 0.06659644406743612, "language_loss": 0.82492054, "learning_rate": 0.0004387801687061814, "loss": 0.83589351, "num_input_tokens_seen": 240183824, "router_z_loss_mlp": 0.10858154, "routerloss_mlp": 0.0, "step": 2874, "time_per_iteration": 2.839524269104004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100748, "balance_loss_mlp": 1.09040689, "diversity_loss_mlp": 0.0, "epoch": 0.5530973451327433, "flos": 581274952704.0, "grad_norm": 0.06411004123803754, "language_loss": 0.80204833, "learning_rate": 0.0004384709823571958, "loss": 0.81305587, "num_input_tokens_seen": 240259296, "router_z_loss_mlp": 0.10345459, "routerloss_mlp": 0.0, "step": 2875, "time_per_iteration": 2.768268346786499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092947, "balance_loss_mlp": 1.08278441, "diversity_loss_mlp": 0.0, "epoch": 0.5532897268180069, "flos": 1122488658432.0, "grad_norm": 0.0827933156096061, "language_loss": 0.83099473, "learning_rate": 0.0004381618198958932, "loss": 0.84192419, "num_input_tokens_seen": 240346768, "router_z_loss_mlp": 0.10162354, "routerloss_mlp": 0.0, "step": 2876, "time_per_iteration": 3.509364604949951 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084381, "balance_loss_mlp": 1.07393849, "diversity_loss_mlp": 0.0, "epoch": 0.5534821085032705, "flos": 637273640448.0, "grad_norm": 0.0672046455921574, "language_loss": 0.83616996, "learning_rate": 0.00043785268144230137, "loss": 0.84701377, "num_input_tokens_seen": 240429344, "router_z_loss_mlp": 0.10449219, "routerloss_mlp": 0.0, "step": 2877, "time_per_iteration": 2.8941080570220947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078858, "balance_loss_mlp": 1.06849325, "diversity_loss_mlp": 0.0, "epoch": 0.5536744901885341, "flos": 571112529408.0, "grad_norm": 0.08466064144544548, "language_loss": 0.82657743, "learning_rate": 0.00043754356711643837, "loss": 0.83736604, "num_input_tokens_seen": 240497008, "router_z_loss_mlp": 0.10369873, "routerloss_mlp": 0.0, "step": 2878, "time_per_iteration": 2.6849513053894043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072509, "balance_loss_mlp": 1.0620904, "diversity_loss_mlp": 0.0, "epoch": 0.5538668718737976, "flos": 595716871680.0, "grad_norm": 0.08115939494621484, "language_loss": 0.84283209, "learning_rate": 0.0004372344770383132, "loss": 0.85355723, "num_input_tokens_seen": 240578432, "router_z_loss_mlp": 0.10424805, "routerloss_mlp": 0.0, "step": 2879, "time_per_iteration": 2.809833526611328 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064473, "balance_loss_mlp": 1.05426884, "diversity_loss_mlp": 0.0, "epoch": 0.5540592535590612, "flos": 532602210816.0, "grad_norm": 0.15468249092113104, "language_loss": 0.82951438, "learning_rate": 0.00043692541132792507, "loss": 0.84015906, "num_input_tokens_seen": 240649136, "router_z_loss_mlp": 0.10205078, "routerloss_mlp": 0.0, "step": 2880, "time_per_iteration": 2.6886332035064697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106573, "balance_loss_mlp": 1.05541205, "diversity_loss_mlp": 0.0, "epoch": 0.5542516352443247, "flos": 412619235840.0, "grad_norm": 0.07258014540865806, "language_loss": 0.83396262, "learning_rate": 0.00043661637010526384, "loss": 0.84461993, "num_input_tokens_seen": 240714240, "router_z_loss_mlp": 0.10314941, "routerloss_mlp": 0.0, "step": 2881, "time_per_iteration": 2.484912872314453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010637, "balance_loss_mlp": 1.05335283, "diversity_loss_mlp": 0.0, "epoch": 0.5544440169295883, "flos": 547607609856.0, "grad_norm": 0.07022154553173111, "language_loss": 0.83217472, "learning_rate": 0.00043630735349031025, "loss": 0.8428117, "num_input_tokens_seen": 240786928, "router_z_loss_mlp": 0.10351562, "routerloss_mlp": 0.0, "step": 2882, "time_per_iteration": 2.627950429916382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064657, "balance_loss_mlp": 1.05427396, "diversity_loss_mlp": 0.0, "epoch": 0.5546363986148518, "flos": 621821131776.0, "grad_norm": 0.05734398116556458, "language_loss": 0.81837022, "learning_rate": 0.00043599836160303495, "loss": 0.8290168, "num_input_tokens_seen": 240865328, "router_z_loss_mlp": 0.10388184, "routerloss_mlp": 0.0, "step": 2883, "time_per_iteration": 2.87358021736145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061705, "balance_loss_mlp": 1.05094647, "diversity_loss_mlp": 0.0, "epoch": 0.5548287803001154, "flos": 705292945920.0, "grad_norm": 0.05952583825506871, "language_loss": 0.77472365, "learning_rate": 0.0004356893945633995, "loss": 0.78534073, "num_input_tokens_seen": 240945680, "router_z_loss_mlp": 0.10760498, "routerloss_mlp": 0.0, "step": 2884, "time_per_iteration": 2.9415786266326904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058414, "balance_loss_mlp": 1.04738104, "diversity_loss_mlp": 0.0, "epoch": 0.555021161985379, "flos": 504197789184.0, "grad_norm": 0.06387157363580499, "language_loss": 0.81997669, "learning_rate": 0.0004353804524913551, "loss": 0.8305608, "num_input_tokens_seen": 241010800, "router_z_loss_mlp": 0.11035156, "routerloss_mlp": 0.0, "step": 2885, "time_per_iteration": 2.5772132873535156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106399, "balance_loss_mlp": 1.05298674, "diversity_loss_mlp": 0.0, "epoch": 0.5552135436706426, "flos": 616066684416.0, "grad_norm": 0.07314612024272811, "language_loss": 0.82015049, "learning_rate": 0.0004350715355068441, "loss": 0.8307904, "num_input_tokens_seen": 241085328, "router_z_loss_mlp": 0.11010742, "routerloss_mlp": 0.0, "step": 2886, "time_per_iteration": 2.7211849689483643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062969, "balance_loss_mlp": 1.05221653, "diversity_loss_mlp": 0.0, "epoch": 0.5554059253559062, "flos": 463871494656.0, "grad_norm": 0.08671001380075964, "language_loss": 0.79774809, "learning_rate": 0.00043476264372979847, "loss": 0.8083778, "num_input_tokens_seen": 241149600, "router_z_loss_mlp": 0.10754395, "routerloss_mlp": 0.0, "step": 2887, "time_per_iteration": 2.5452206134796143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064606, "balance_loss_mlp": 1.05403173, "diversity_loss_mlp": 0.0, "epoch": 0.5555983070411696, "flos": 1562512384512.0, "grad_norm": 0.08125450311694367, "language_loss": 0.78590369, "learning_rate": 0.0004344537772801408, "loss": 0.79654968, "num_input_tokens_seen": 241244832, "router_z_loss_mlp": 0.10577393, "routerloss_mlp": 0.0, "step": 2888, "time_per_iteration": 3.870267391204834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01033708, "balance_loss_mlp": 1.02839172, "diversity_loss_mlp": 0.0, "epoch": 0.5557906887264332, "flos": 1467917821440.0, "grad_norm": 0.026917818165577125, "language_loss": 0.73422456, "learning_rate": 0.0004341449362777836, "loss": 0.74456155, "num_input_tokens_seen": 241479728, "router_z_loss_mlp": 0.05322266, "routerloss_mlp": 0.0, "step": 2889, "time_per_iteration": 4.943026065826416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091351, "balance_loss_mlp": 1.08043766, "diversity_loss_mlp": 0.0, "epoch": 0.5559830704116968, "flos": 529832544768.0, "grad_norm": 0.07456412824125162, "language_loss": 0.83536172, "learning_rate": 0.0004338361208426298, "loss": 0.84627521, "num_input_tokens_seen": 241545616, "router_z_loss_mlp": 0.10919189, "routerloss_mlp": 0.0, "step": 2890, "time_per_iteration": 2.65266752243042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094404, "balance_loss_mlp": 1.08348465, "diversity_loss_mlp": 0.0, "epoch": 0.5561754520969604, "flos": 651218890752.0, "grad_norm": 0.057576040721241756, "language_loss": 0.81499392, "learning_rate": 0.00043352733109457164, "loss": 0.82593793, "num_input_tokens_seen": 241629040, "router_z_loss_mlp": 0.10919189, "routerloss_mlp": 0.0, "step": 2891, "time_per_iteration": 2.927246332168579 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106638, "balance_loss_mlp": 1.09556401, "diversity_loss_mlp": 0.0, "epoch": 0.556367833782224, "flos": 734297923584.0, "grad_norm": 0.0763949134442708, "language_loss": 0.84462321, "learning_rate": 0.00043321856715349244, "loss": 0.85568959, "num_input_tokens_seen": 241706272, "router_z_loss_mlp": 0.11077881, "routerloss_mlp": 0.0, "step": 2892, "time_per_iteration": 2.970857858657837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110278, "balance_loss_mlp": 1.0918721, "diversity_loss_mlp": 0.0, "epoch": 0.5565602154674875, "flos": 672423648768.0, "grad_norm": 0.07453927070697552, "language_loss": 0.80594504, "learning_rate": 0.00043290982913926466, "loss": 0.81697285, "num_input_tokens_seen": 241782304, "router_z_loss_mlp": 0.10913086, "routerloss_mlp": 0.0, "step": 2893, "time_per_iteration": 2.8581972122192383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01105658, "balance_loss_mlp": 1.09473801, "diversity_loss_mlp": 0.0, "epoch": 0.556752597152751, "flos": 586228783104.0, "grad_norm": 0.08476057735977802, "language_loss": 0.84177083, "learning_rate": 0.0004326011171717514, "loss": 0.85282743, "num_input_tokens_seen": 241868576, "router_z_loss_mlp": 0.109375, "routerloss_mlp": 0.0, "step": 2894, "time_per_iteration": 2.90563702583313 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094642, "balance_loss_mlp": 1.08371019, "diversity_loss_mlp": 0.0, "epoch": 0.5569449788380146, "flos": 437777146368.0, "grad_norm": 0.06785531665857511, "language_loss": 0.80468631, "learning_rate": 0.0004322924313708051, "loss": 0.8156327, "num_input_tokens_seen": 241933696, "router_z_loss_mlp": 0.10931396, "routerloss_mlp": 0.0, "step": 2895, "time_per_iteration": 2.51784610748291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092855, "balance_loss_mlp": 1.08219218, "diversity_loss_mlp": 0.0, "epoch": 0.5571373605232782, "flos": 502250761728.0, "grad_norm": 0.07706946900287333, "language_loss": 0.84533763, "learning_rate": 0.0004319837718562681, "loss": 0.85626626, "num_input_tokens_seen": 242003056, "router_z_loss_mlp": 0.10668945, "routerloss_mlp": 0.0, "step": 2896, "time_per_iteration": 2.5862512588500977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083747, "balance_loss_mlp": 1.07321525, "diversity_loss_mlp": 0.0, "epoch": 0.5573297422085417, "flos": 577417973760.0, "grad_norm": 0.0793708179068888, "language_loss": 0.83050567, "learning_rate": 0.0004316751387479726, "loss": 0.84134316, "num_input_tokens_seen": 242076368, "router_z_loss_mlp": 0.10534668, "routerloss_mlp": 0.0, "step": 2897, "time_per_iteration": 2.778136730194092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00857516, "balance_loss_mlp": 1.47219694, "diversity_loss_mlp": 0.21748725, "epoch": 0.5575221238938053, "flos": 1344037515264.0, "grad_norm": 0.034004819690404205, "language_loss": 0.82499564, "learning_rate": 0.0004313665321657409, "loss": 0.83357084, "num_input_tokens_seen": 242161600, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01267361, "step": 2898, "time_per_iteration": 3.7754030227661133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078665, "balance_loss_mlp": 1.06795418, "diversity_loss_mlp": 0.0, "epoch": 0.5577145055790689, "flos": 601963218432.0, "grad_norm": 0.08236969633510602, "language_loss": 0.79824448, "learning_rate": 0.00043105795222938436, "loss": 0.80903113, "num_input_tokens_seen": 242237904, "router_z_loss_mlp": 0.1071167, "routerloss_mlp": 0.0, "step": 2899, "time_per_iteration": 2.7090694904327393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073397, "balance_loss_mlp": 1.06296027, "diversity_loss_mlp": 0.0, "epoch": 0.5579068872643325, "flos": 562620349440.0, "grad_norm": 0.07659548301877016, "language_loss": 0.78690445, "learning_rate": 0.00043074939905870467, "loss": 0.79763848, "num_input_tokens_seen": 242306736, "router_z_loss_mlp": 0.10443115, "routerloss_mlp": 0.0, "step": 2900, "time_per_iteration": 2.6444900035858154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069144, "balance_loss_mlp": 1.05899358, "diversity_loss_mlp": 0.0, "epoch": 0.558099268949596, "flos": 544551247872.0, "grad_norm": 0.08372730008806528, "language_loss": 0.80284113, "learning_rate": 0.0004304408727734927, "loss": 0.81353253, "num_input_tokens_seen": 242376000, "router_z_loss_mlp": 0.10150146, "routerloss_mlp": 0.0, "step": 2901, "time_per_iteration": 2.6800661087036133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00855039, "balance_loss_mlp": 1.46478724, "diversity_loss_mlp": 0.21833366, "epoch": 0.5582916506348595, "flos": 552786467328.0, "grad_norm": 0.026106559121528438, "language_loss": 0.88945115, "learning_rate": 0.0004301323734935288, "loss": 0.89800155, "num_input_tokens_seen": 242447056, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01347797, "step": 2902, "time_per_iteration": 2.6880388259887695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106479, "balance_loss_mlp": 1.05446076, "diversity_loss_mlp": 0.0, "epoch": 0.5584840323201231, "flos": 543385013760.0, "grad_norm": 0.08715674624995783, "language_loss": 0.87386537, "learning_rate": 0.000429823901338583, "loss": 0.88451326, "num_input_tokens_seen": 242514400, "router_z_loss_mlp": 0.10333252, "routerloss_mlp": 0.0, "step": 2903, "time_per_iteration": 2.611330032348633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070577, "balance_loss_mlp": 1.06004524, "diversity_loss_mlp": 0.0, "epoch": 0.5586764140053867, "flos": 815573090304.0, "grad_norm": 0.07350666628476007, "language_loss": 0.86772639, "learning_rate": 0.00042951545642841513, "loss": 0.87843215, "num_input_tokens_seen": 242601616, "router_z_loss_mlp": 0.10534668, "routerloss_mlp": 0.0, "step": 2904, "time_per_iteration": 3.066653251647949 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078659, "balance_loss_mlp": 1.06802535, "diversity_loss_mlp": 0.0, "epoch": 0.5588687956906503, "flos": 486439976448.0, "grad_norm": 0.06907930895976065, "language_loss": 0.86694556, "learning_rate": 0.0004292070388827737, "loss": 0.87773216, "num_input_tokens_seen": 242669648, "router_z_loss_mlp": 0.10644531, "routerloss_mlp": 0.0, "step": 2905, "time_per_iteration": 2.5430614948272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068326, "balance_loss_mlp": 1.05785918, "diversity_loss_mlp": 0.0, "epoch": 0.5590611773759138, "flos": 452060849664.0, "grad_norm": 0.06877653703862108, "language_loss": 0.81346464, "learning_rate": 0.00042889864882139753, "loss": 0.82414794, "num_input_tokens_seen": 242737456, "router_z_loss_mlp": 0.10473633, "routerloss_mlp": 0.0, "step": 2906, "time_per_iteration": 2.5722434520721436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075755, "balance_loss_mlp": 1.06534863, "diversity_loss_mlp": 0.0, "epoch": 0.5592535590611774, "flos": 520945012224.0, "grad_norm": 0.06732553967994827, "language_loss": 0.81503737, "learning_rate": 0.0004285902863640139, "loss": 0.82579494, "num_input_tokens_seen": 242807008, "router_z_loss_mlp": 0.10406494, "routerloss_mlp": 0.0, "step": 2907, "time_per_iteration": 2.643721580505371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074648, "balance_loss_mlp": 1.06431222, "diversity_loss_mlp": 0.0, "epoch": 0.5594459407464409, "flos": 552519595008.0, "grad_norm": 0.06943407338412115, "language_loss": 0.86278725, "learning_rate": 0.00042828195163033966, "loss": 0.87353367, "num_input_tokens_seen": 242877328, "router_z_loss_mlp": 0.10339355, "routerloss_mlp": 0.0, "step": 2908, "time_per_iteration": 2.7045791149139404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081822, "balance_loss_mlp": 1.07135582, "diversity_loss_mlp": 0.0, "epoch": 0.5596383224317045, "flos": 484833973248.0, "grad_norm": 0.07324820072157985, "language_loss": 0.79102659, "learning_rate": 0.0004279736447400812, "loss": 0.80184484, "num_input_tokens_seen": 242943152, "router_z_loss_mlp": 0.10473633, "routerloss_mlp": 0.0, "step": 2909, "time_per_iteration": 2.585176944732666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107588, "balance_loss_mlp": 1.06558049, "diversity_loss_mlp": 0.0, "epoch": 0.5598307041169681, "flos": 611256015360.0, "grad_norm": 0.07142642262643135, "language_loss": 0.78468478, "learning_rate": 0.00042766536581293385, "loss": 0.79544365, "num_input_tokens_seen": 243014656, "router_z_loss_mlp": 0.10302734, "routerloss_mlp": 0.0, "step": 2910, "time_per_iteration": 2.723602771759033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090371, "balance_loss_mlp": 1.07975566, "diversity_loss_mlp": 0.0, "epoch": 0.5600230858022316, "flos": 488851365888.0, "grad_norm": 0.0702995437532307, "language_loss": 0.79552364, "learning_rate": 0.0004273571149685819, "loss": 0.80642736, "num_input_tokens_seen": 243089040, "router_z_loss_mlp": 0.10620117, "routerloss_mlp": 0.0, "step": 2911, "time_per_iteration": 2.7220258712768555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091791, "balance_loss_mlp": 1.08147311, "diversity_loss_mlp": 0.0, "epoch": 0.5602154674874952, "flos": 598869780480.0, "grad_norm": 0.06270923487878967, "language_loss": 0.84021366, "learning_rate": 0.00042704889232669937, "loss": 0.85113156, "num_input_tokens_seen": 243162480, "router_z_loss_mlp": 0.10321045, "routerloss_mlp": 0.0, "step": 2912, "time_per_iteration": 2.6799380779266357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00848913, "balance_loss_mlp": 1.45588994, "diversity_loss_mlp": 0.21708892, "epoch": 0.5604078491727588, "flos": 585969624576.0, "grad_norm": 0.03254511626684893, "language_loss": 0.85648382, "learning_rate": 0.0004267406980069484, "loss": 0.86497295, "num_input_tokens_seen": 243232880, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01242387, "step": 2913, "time_per_iteration": 2.7309391498565674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0111244, "balance_loss_mlp": 1.10193157, "diversity_loss_mlp": 0.0, "epoch": 0.5606002308580224, "flos": 541205618688.0, "grad_norm": 0.05402445789476675, "language_loss": 0.79744071, "learning_rate": 0.0004264325321289808, "loss": 0.80856508, "num_input_tokens_seen": 243309168, "router_z_loss_mlp": 0.10510254, "routerloss_mlp": 0.0, "step": 2914, "time_per_iteration": 2.8245773315429688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104335, "balance_loss_mlp": 1.09404707, "diversity_loss_mlp": 0.0, "epoch": 0.5607926125432858, "flos": 583938533376.0, "grad_norm": 0.07588418732744176, "language_loss": 0.86308336, "learning_rate": 0.00042612439481243736, "loss": 0.87412667, "num_input_tokens_seen": 243382064, "router_z_loss_mlp": 0.10284424, "routerloss_mlp": 0.0, "step": 2915, "time_per_iteration": 2.7910971641540527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01109566, "balance_loss_mlp": 1.09916496, "diversity_loss_mlp": 0.0, "epoch": 0.5609849942285494, "flos": 627489317376.0, "grad_norm": 0.07165476469353879, "language_loss": 0.90284097, "learning_rate": 0.00042581628617694735, "loss": 0.91393661, "num_input_tokens_seen": 243452064, "router_z_loss_mlp": 0.10412598, "routerloss_mlp": 0.0, "step": 2916, "time_per_iteration": 2.7449898719787598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00839442, "balance_loss_mlp": 1.43753612, "diversity_loss_mlp": 0.21687999, "epoch": 0.561177375913813, "flos": 588366332928.0, "grad_norm": 0.03331291255724556, "language_loss": 0.81856477, "learning_rate": 0.0004255082063421296, "loss": 0.82695925, "num_input_tokens_seen": 243525600, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01223436, "step": 2917, "time_per_iteration": 2.705263614654541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01131558, "balance_loss_mlp": 1.12130046, "diversity_loss_mlp": 0.0, "epoch": 0.5613697575990766, "flos": 527047824384.0, "grad_norm": 0.07697799391889214, "language_loss": 0.84842837, "learning_rate": 0.00042520015542759065, "loss": 0.85974395, "num_input_tokens_seen": 243605536, "router_z_loss_mlp": 0.10253906, "routerloss_mlp": 0.0, "step": 2918, "time_per_iteration": 2.8643360137939453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110893, "balance_loss_mlp": 1.09857666, "diversity_loss_mlp": 0.0, "epoch": 0.5615621392843402, "flos": 642655130112.0, "grad_norm": 0.059259650717302215, "language_loss": 0.88182557, "learning_rate": 0.00042489213355292687, "loss": 0.89291489, "num_input_tokens_seen": 243684208, "router_z_loss_mlp": 0.10357666, "routerloss_mlp": 0.0, "step": 2919, "time_per_iteration": 2.871605634689331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01113923, "balance_loss_mlp": 1.1035037, "diversity_loss_mlp": 0.0, "epoch": 0.5617545209696037, "flos": 427750543872.0, "grad_norm": 0.07025137955977834, "language_loss": 0.81129396, "learning_rate": 0.00042458414083772276, "loss": 0.82243323, "num_input_tokens_seen": 243749376, "router_z_loss_mlp": 0.10424805, "routerloss_mlp": 0.0, "step": 2920, "time_per_iteration": 2.5280137062072754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110383, "balance_loss_mlp": 1.09353638, "diversity_loss_mlp": 0.0, "epoch": 0.5619469026548672, "flos": 568429125120.0, "grad_norm": 0.06291310679725345, "language_loss": 0.85259616, "learning_rate": 0.000424276177401552, "loss": 0.86363447, "num_input_tokens_seen": 243828096, "router_z_loss_mlp": 0.10296631, "routerloss_mlp": 0.0, "step": 2921, "time_per_iteration": 2.8061861991882324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091107, "balance_loss_mlp": 1.08052063, "diversity_loss_mlp": 0.0, "epoch": 0.5621392843401308, "flos": 505205807616.0, "grad_norm": 0.06947728514830868, "language_loss": 0.8586399, "learning_rate": 0.0004239682433639763, "loss": 0.86955094, "num_input_tokens_seen": 243896752, "router_z_loss_mlp": 0.10583496, "routerloss_mlp": 0.0, "step": 2922, "time_per_iteration": 2.7068192958831787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087689, "balance_loss_mlp": 1.07726383, "diversity_loss_mlp": 0.0, "epoch": 0.5623316660253944, "flos": 516996628992.0, "grad_norm": 0.06724553342566655, "language_loss": 0.85617495, "learning_rate": 0.0004236603388445467, "loss": 0.86705184, "num_input_tokens_seen": 243964592, "router_z_loss_mlp": 0.10418701, "routerloss_mlp": 0.0, "step": 2923, "time_per_iteration": 2.5658164024353027 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083423, "balance_loss_mlp": 1.07329023, "diversity_loss_mlp": 0.0, "epoch": 0.5625240477106579, "flos": 606012917760.0, "grad_norm": 0.06491959150956746, "language_loss": 0.82087809, "learning_rate": 0.00042335246396280166, "loss": 0.83171237, "num_input_tokens_seen": 244036656, "router_z_loss_mlp": 0.10131836, "routerloss_mlp": 0.0, "step": 2924, "time_per_iteration": 2.7210686206817627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076248, "balance_loss_mlp": 1.06606197, "diversity_loss_mlp": 0.0, "epoch": 0.5627164293959215, "flos": 450430253568.0, "grad_norm": 0.06924351044147684, "language_loss": 0.90442908, "learning_rate": 0.0004230446188382693, "loss": 0.91519153, "num_input_tokens_seen": 244102704, "router_z_loss_mlp": 0.10186768, "routerloss_mlp": 0.0, "step": 2925, "time_per_iteration": 2.5210559368133545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072823, "balance_loss_mlp": 1.06237423, "diversity_loss_mlp": 0.0, "epoch": 0.5629088110811851, "flos": 742073550336.0, "grad_norm": 0.06189914516088338, "language_loss": 0.80191588, "learning_rate": 0.0004227368035904654, "loss": 0.81264406, "num_input_tokens_seen": 244186640, "router_z_loss_mlp": 0.10455322, "routerloss_mlp": 0.0, "step": 2926, "time_per_iteration": 2.957545757293701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073658, "balance_loss_mlp": 1.06312013, "diversity_loss_mlp": 0.0, "epoch": 0.5631011927664487, "flos": 496970588160.0, "grad_norm": 0.07119677802103677, "language_loss": 0.8312782, "learning_rate": 0.00042242901833889474, "loss": 0.84201479, "num_input_tokens_seen": 244257680, "router_z_loss_mlp": 0.10540771, "routerloss_mlp": 0.0, "step": 2927, "time_per_iteration": 2.6197497844696045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069584, "balance_loss_mlp": 1.05933261, "diversity_loss_mlp": 0.0, "epoch": 0.5632935744517122, "flos": 886137408000.0, "grad_norm": 0.07548469953325632, "language_loss": 0.85944557, "learning_rate": 0.0004221212632030501, "loss": 0.87014145, "num_input_tokens_seen": 244331248, "router_z_loss_mlp": 0.10253906, "routerloss_mlp": 0.0, "step": 2928, "time_per_iteration": 3.0718417167663574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074032, "balance_loss_mlp": 1.0636375, "diversity_loss_mlp": 0.0, "epoch": 0.5634859561369757, "flos": 604792355328.0, "grad_norm": 0.0702405954135719, "language_loss": 0.8005904, "learning_rate": 0.0004218135383024124, "loss": 0.81133074, "num_input_tokens_seen": 244403920, "router_z_loss_mlp": 0.10394287, "routerloss_mlp": 0.0, "step": 2929, "time_per_iteration": 2.6883885860443115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068804, "balance_loss_mlp": 1.05836129, "diversity_loss_mlp": 0.0, "epoch": 0.5636783378222393, "flos": 453916472832.0, "grad_norm": 0.07423933793606223, "language_loss": 0.85405028, "learning_rate": 0.0004215058437564511, "loss": 0.86473835, "num_input_tokens_seen": 244470464, "router_z_loss_mlp": 0.10443115, "routerloss_mlp": 0.0, "step": 2930, "time_per_iteration": 2.5645458698272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075733, "balance_loss_mlp": 1.06520677, "diversity_loss_mlp": 0.0, "epoch": 0.5638707195075029, "flos": 518456899584.0, "grad_norm": 0.07045402067927274, "language_loss": 0.82365847, "learning_rate": 0.00042119817968462397, "loss": 0.83441579, "num_input_tokens_seen": 244536864, "router_z_loss_mlp": 0.10528564, "routerloss_mlp": 0.0, "step": 2931, "time_per_iteration": 2.596431255340576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00843243, "balance_loss_mlp": 1.44432163, "diversity_loss_mlp": 0.21611315, "epoch": 0.5640631011927665, "flos": 564873896448.0, "grad_norm": 0.034099962370994746, "language_loss": 0.87154222, "learning_rate": 0.0004208905462063766, "loss": 0.8799746, "num_input_tokens_seen": 244603344, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01302544, "step": 2932, "time_per_iteration": 2.7103724479675293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088146, "balance_loss_mlp": 1.07760167, "diversity_loss_mlp": 0.0, "epoch": 0.56425548287803, "flos": 517033704960.0, "grad_norm": 0.07257480225633914, "language_loss": 0.84035242, "learning_rate": 0.00042058294344114315, "loss": 0.8512339, "num_input_tokens_seen": 244671984, "router_z_loss_mlp": 0.10546875, "routerloss_mlp": 0.0, "step": 2933, "time_per_iteration": 2.6817541122436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00846618, "balance_loss_mlp": 1.45035362, "diversity_loss_mlp": 0.21710092, "epoch": 0.5644478645632935, "flos": 854258876928.0, "grad_norm": 0.03239193802507573, "language_loss": 0.77597153, "learning_rate": 0.0004202753715083456, "loss": 0.78443778, "num_input_tokens_seen": 244754000, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01289086, "step": 2934, "time_per_iteration": 3.1172194480895996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097711, "balance_loss_mlp": 1.08684492, "diversity_loss_mlp": 0.0, "epoch": 0.5646402462485571, "flos": 553438780416.0, "grad_norm": 0.08960488369203884, "language_loss": 0.8126961, "learning_rate": 0.0004199678305273936, "loss": 0.82367325, "num_input_tokens_seen": 244820896, "router_z_loss_mlp": 0.10876465, "routerloss_mlp": 0.0, "step": 2935, "time_per_iteration": 2.648293972015381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01096103, "balance_loss_mlp": 1.08564794, "diversity_loss_mlp": 0.0, "epoch": 0.5648326279338207, "flos": 685990798848.0, "grad_norm": 0.06584718006017456, "language_loss": 0.81395173, "learning_rate": 0.0004196603206176854, "loss": 0.82491279, "num_input_tokens_seen": 244904464, "router_z_loss_mlp": 0.10461426, "routerloss_mlp": 0.0, "step": 2936, "time_per_iteration": 2.9504921436309814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110133, "balance_loss_mlp": 1.09094691, "diversity_loss_mlp": 0.0, "epoch": 0.5650250096190843, "flos": 803327818752.0, "grad_norm": 0.06854637503151859, "language_loss": 0.83705592, "learning_rate": 0.000419352841898607, "loss": 0.84806919, "num_input_tokens_seen": 244983760, "router_z_loss_mlp": 0.10388184, "routerloss_mlp": 0.0, "step": 2937, "time_per_iteration": 2.965176582336426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100386, "balance_loss_mlp": 1.09003913, "diversity_loss_mlp": 0.0, "epoch": 0.5652173913043478, "flos": 582058317312.0, "grad_norm": 0.06908295336200668, "language_loss": 0.77684075, "learning_rate": 0.000419045394489532, "loss": 0.7878446, "num_input_tokens_seen": 245053184, "router_z_loss_mlp": 0.10345459, "routerloss_mlp": 0.0, "step": 2938, "time_per_iteration": 2.692997455596924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094877, "balance_loss_mlp": 1.08429718, "diversity_loss_mlp": 0.0, "epoch": 0.5654097729896114, "flos": 820648060416.0, "grad_norm": 0.06508171061148607, "language_loss": 0.76831025, "learning_rate": 0.0004187379785098224, "loss": 0.77925897, "num_input_tokens_seen": 245137408, "router_z_loss_mlp": 0.10583496, "routerloss_mlp": 0.0, "step": 2939, "time_per_iteration": 3.123154401779175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0110149, "balance_loss_mlp": 1.09110653, "diversity_loss_mlp": 0.0, "epoch": 0.565602154674875, "flos": 784156723200.0, "grad_norm": 0.08014464510269267, "language_loss": 0.83749938, "learning_rate": 0.00041843059407882744, "loss": 0.84851432, "num_input_tokens_seen": 245215504, "router_z_loss_mlp": 0.10388184, "routerloss_mlp": 0.0, "step": 2940, "time_per_iteration": 2.9720611572265625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099107, "balance_loss_mlp": 1.0887475, "diversity_loss_mlp": 0.0, "epoch": 0.5657945363601385, "flos": 549683117568.0, "grad_norm": 0.06910210619422795, "language_loss": 0.82642627, "learning_rate": 0.0004181232413158842, "loss": 0.83741736, "num_input_tokens_seen": 245286032, "router_z_loss_mlp": 0.10357666, "routerloss_mlp": 0.0, "step": 2941, "time_per_iteration": 2.657360315322876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094217, "balance_loss_mlp": 1.08388722, "diversity_loss_mlp": 0.0, "epoch": 0.5659869180454021, "flos": 668126900736.0, "grad_norm": 0.08913898875539945, "language_loss": 0.82192254, "learning_rate": 0.0004178159203403179, "loss": 0.83286464, "num_input_tokens_seen": 245359040, "router_z_loss_mlp": 0.10333252, "routerloss_mlp": 0.0, "step": 2942, "time_per_iteration": 2.8812596797943115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080275, "balance_loss_mlp": 1.07014799, "diversity_loss_mlp": 0.0, "epoch": 0.5661792997306656, "flos": 499955369472.0, "grad_norm": 0.06202774017820852, "language_loss": 0.8130517, "learning_rate": 0.0004175086312714409, "loss": 0.82385445, "num_input_tokens_seen": 245426384, "router_z_loss_mlp": 0.10125732, "routerloss_mlp": 0.0, "step": 2943, "time_per_iteration": 2.561537027359009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080645, "balance_loss_mlp": 1.07015431, "diversity_loss_mlp": 0.0, "epoch": 0.5663716814159292, "flos": 601209589248.0, "grad_norm": 0.05809127095966742, "language_loss": 0.83570457, "learning_rate": 0.00041720137422855366, "loss": 0.84651101, "num_input_tokens_seen": 245501216, "router_z_loss_mlp": 0.10491943, "routerloss_mlp": 0.0, "step": 2944, "time_per_iteration": 2.7395284175872803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075472, "balance_loss_mlp": 1.06576228, "diversity_loss_mlp": 0.0, "epoch": 0.5665640631011928, "flos": 540988305408.0, "grad_norm": 0.07239714207057282, "language_loss": 0.79116005, "learning_rate": 0.00041689414933094383, "loss": 0.80191475, "num_input_tokens_seen": 245571600, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 2945, "time_per_iteration": 2.654930353164673 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067367, "balance_loss_mlp": 1.05734193, "diversity_loss_mlp": 0.0, "epoch": 0.5667564447864564, "flos": 601936054272.0, "grad_norm": 0.07615309090382201, "language_loss": 0.80823922, "learning_rate": 0.00041658695669788653, "loss": 0.81891298, "num_input_tokens_seen": 245645632, "router_z_loss_mlp": 0.10021973, "routerloss_mlp": 0.0, "step": 2946, "time_per_iteration": 2.747903347015381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069298, "balance_loss_mlp": 1.05894506, "diversity_loss_mlp": 0.0, "epoch": 0.5669488264717198, "flos": 659523492864.0, "grad_norm": 0.09594015960064259, "language_loss": 0.81304628, "learning_rate": 0.00041627979644864453, "loss": 0.82373923, "num_input_tokens_seen": 245715776, "router_z_loss_mlp": 0.10357666, "routerloss_mlp": 0.0, "step": 2947, "time_per_iteration": 2.8192365169525146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064628, "balance_loss_mlp": 1.05435264, "diversity_loss_mlp": 0.0, "epoch": 0.5671412081569834, "flos": 485402222592.0, "grad_norm": 0.06124486727819338, "language_loss": 0.81212783, "learning_rate": 0.0004159726687024683, "loss": 0.82277411, "num_input_tokens_seen": 245785328, "router_z_loss_mlp": 0.1027832, "routerloss_mlp": 0.0, "step": 2948, "time_per_iteration": 2.634019613265991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066585, "balance_loss_mlp": 1.05610037, "diversity_loss_mlp": 0.0, "epoch": 0.567333589842247, "flos": 729801114624.0, "grad_norm": 0.0698899799050157, "language_loss": 0.7929486, "learning_rate": 0.00041566557357859506, "loss": 0.80361444, "num_input_tokens_seen": 245858000, "router_z_loss_mlp": 0.1048584, "routerloss_mlp": 0.0, "step": 2949, "time_per_iteration": 2.861374616622925 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068636, "balance_loss_mlp": 1.05816913, "diversity_loss_mlp": 0.0, "epoch": 0.5675259715275106, "flos": 968887526400.0, "grad_norm": 0.0603589352170923, "language_loss": 0.79605162, "learning_rate": 0.0004153585111962502, "loss": 0.80673802, "num_input_tokens_seen": 245950640, "router_z_loss_mlp": 0.10473633, "routerloss_mlp": 0.0, "step": 2950, "time_per_iteration": 3.3136749267578125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076091, "balance_loss_mlp": 1.06528509, "diversity_loss_mlp": 0.0, "epoch": 0.5677183532127742, "flos": 565145538048.0, "grad_norm": 0.07046051490297799, "language_loss": 0.84271163, "learning_rate": 0.0004150514816746453, "loss": 0.85347259, "num_input_tokens_seen": 246019568, "router_z_loss_mlp": 0.10803223, "routerloss_mlp": 0.0, "step": 2951, "time_per_iteration": 2.7142550945281982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079575, "balance_loss_mlp": 1.0689894, "diversity_loss_mlp": 0.0, "epoch": 0.5679107348980377, "flos": 551694385152.0, "grad_norm": 0.07561213643312675, "language_loss": 0.85564739, "learning_rate": 0.0004147444851329802, "loss": 0.8664431, "num_input_tokens_seen": 246089520, "router_z_loss_mlp": 0.105896, "routerloss_mlp": 0.0, "step": 2952, "time_per_iteration": 2.663442611694336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079915, "balance_loss_mlp": 1.06943655, "diversity_loss_mlp": 0.0, "epoch": 0.5681031165833013, "flos": 819459804672.0, "grad_norm": 0.06334656392280237, "language_loss": 0.85917854, "learning_rate": 0.00041443752169044126, "loss": 0.86997765, "num_input_tokens_seen": 246165920, "router_z_loss_mlp": 0.1048584, "routerloss_mlp": 0.0, "step": 2953, "time_per_iteration": 3.0424787998199463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083209, "balance_loss_mlp": 1.07296944, "diversity_loss_mlp": 0.0, "epoch": 0.5682954982685648, "flos": 618013711872.0, "grad_norm": 0.08759511227816434, "language_loss": 0.84844387, "learning_rate": 0.0004141305914662025, "loss": 0.85927594, "num_input_tokens_seen": 246238672, "router_z_loss_mlp": 0.10241699, "routerloss_mlp": 0.0, "step": 2954, "time_per_iteration": 2.720574378967285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080604, "balance_loss_mlp": 1.06977344, "diversity_loss_mlp": 0.0, "epoch": 0.5684878799538284, "flos": 647949984768.0, "grad_norm": 0.0625505952609041, "language_loss": 0.80443704, "learning_rate": 0.0004138236945794246, "loss": 0.81524312, "num_input_tokens_seen": 246320208, "router_z_loss_mlp": 0.10839844, "routerloss_mlp": 0.0, "step": 2955, "time_per_iteration": 2.880007743835449 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067912, "balance_loss_mlp": 1.05775595, "diversity_loss_mlp": 0.0, "epoch": 0.5686802616390919, "flos": 805961664000.0, "grad_norm": 0.08164782403227437, "language_loss": 0.84066302, "learning_rate": 0.00041351683114925576, "loss": 0.85134214, "num_input_tokens_seen": 246406464, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 2956, "time_per_iteration": 3.061213731765747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072205, "balance_loss_mlp": 1.06213737, "diversity_loss_mlp": 0.0, "epoch": 0.5688726433243555, "flos": 547140676608.0, "grad_norm": 0.06079019071224684, "language_loss": 0.86355555, "learning_rate": 0.0004132100012948308, "loss": 0.87427759, "num_input_tokens_seen": 246477456, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 2957, "time_per_iteration": 2.631786823272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069231, "balance_loss_mlp": 1.0587523, "diversity_loss_mlp": 0.0, "epoch": 0.5690650250096191, "flos": 486568456704.0, "grad_norm": 0.07979265854660174, "language_loss": 0.84526646, "learning_rate": 0.00041290320513527145, "loss": 0.85595882, "num_input_tokens_seen": 246541744, "router_z_loss_mlp": 0.10473633, "routerloss_mlp": 0.0, "step": 2958, "time_per_iteration": 2.5593366622924805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061477, "balance_loss_mlp": 1.05111814, "diversity_loss_mlp": 0.0, "epoch": 0.5692574066948827, "flos": 577457620992.0, "grad_norm": 0.09201222931646683, "language_loss": 0.85128796, "learning_rate": 0.0004125964427896867, "loss": 0.86190271, "num_input_tokens_seen": 246611440, "router_z_loss_mlp": 0.1036377, "routerloss_mlp": 0.0, "step": 2959, "time_per_iteration": 2.667381525039673 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063517, "balance_loss_mlp": 1.05320501, "diversity_loss_mlp": 0.0, "epoch": 0.5694497883801463, "flos": 454247585280.0, "grad_norm": 0.06922825543149586, "language_loss": 0.79212141, "learning_rate": 0.0004122897143771723, "loss": 0.80275661, "num_input_tokens_seen": 246676496, "router_z_loss_mlp": 0.10314941, "routerloss_mlp": 0.0, "step": 2960, "time_per_iteration": 2.523068904876709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067248, "balance_loss_mlp": 1.0569005, "diversity_loss_mlp": 0.0, "epoch": 0.5696421700654097, "flos": 559516999680.0, "grad_norm": 0.06880331468011665, "language_loss": 0.81306094, "learning_rate": 0.0004119830200168109, "loss": 0.82373345, "num_input_tokens_seen": 246746464, "router_z_loss_mlp": 0.10351562, "routerloss_mlp": 0.0, "step": 2961, "time_per_iteration": 2.7224626541137695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106382, "balance_loss_mlp": 1.05356169, "diversity_loss_mlp": 0.0, "epoch": 0.5698345517506733, "flos": 465551649792.0, "grad_norm": 0.08443053343043137, "language_loss": 0.88515878, "learning_rate": 0.0004116763598276714, "loss": 0.89579695, "num_input_tokens_seen": 246811808, "router_z_loss_mlp": 0.1026001, "routerloss_mlp": 0.0, "step": 2962, "time_per_iteration": 2.4910728931427 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067582, "balance_loss_mlp": 1.05738318, "diversity_loss_mlp": 0.0, "epoch": 0.5700269334359369, "flos": 605953446912.0, "grad_norm": 0.07427131552828858, "language_loss": 0.81298989, "learning_rate": 0.00041136973392881017, "loss": 0.82366574, "num_input_tokens_seen": 246890432, "router_z_loss_mlp": 0.10198975, "routerloss_mlp": 0.0, "step": 2963, "time_per_iteration": 2.8261218070983887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063452, "balance_loss_mlp": 1.05275846, "diversity_loss_mlp": 0.0, "epoch": 0.5702193151212005, "flos": 562709182464.0, "grad_norm": 0.0795338566562928, "language_loss": 0.82039535, "learning_rate": 0.00041106314243926983, "loss": 0.83102989, "num_input_tokens_seen": 246959616, "router_z_loss_mlp": 0.10699463, "routerloss_mlp": 0.0, "step": 2964, "time_per_iteration": 2.7321033477783203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058191, "balance_loss_mlp": 1.04802823, "diversity_loss_mlp": 0.0, "epoch": 0.570411696806464, "flos": 523247745024.0, "grad_norm": 0.07985594809339186, "language_loss": 0.87473917, "learning_rate": 0.0004107565854780798, "loss": 0.88532114, "num_input_tokens_seen": 247030656, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 2965, "time_per_iteration": 2.685188055038452 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105982, "balance_loss_mlp": 1.0495863, "diversity_loss_mlp": 0.0, "epoch": 0.5706040784917276, "flos": 718222837248.0, "grad_norm": 0.12021988187086102, "language_loss": 0.80887079, "learning_rate": 0.000410450063164256, "loss": 0.81946903, "num_input_tokens_seen": 247105872, "router_z_loss_mlp": 0.10241699, "routerloss_mlp": 0.0, "step": 2966, "time_per_iteration": 2.8859732151031494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061114, "balance_loss_mlp": 1.05084372, "diversity_loss_mlp": 0.0, "epoch": 0.5707964601769911, "flos": 476707410432.0, "grad_norm": 0.07877125068742231, "language_loss": 0.82298398, "learning_rate": 0.00041014357561680115, "loss": 0.83359516, "num_input_tokens_seen": 247170448, "router_z_loss_mlp": 0.10266113, "routerloss_mlp": 0.0, "step": 2967, "time_per_iteration": 2.5546090602874756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072036, "balance_loss_mlp": 1.06186163, "diversity_loss_mlp": 0.0, "epoch": 0.5709888418622547, "flos": 580101378048.0, "grad_norm": 0.0603559044145355, "language_loss": 0.86396813, "learning_rate": 0.0004098371229547039, "loss": 0.87468845, "num_input_tokens_seen": 247240400, "router_z_loss_mlp": 0.10174561, "routerloss_mlp": 0.0, "step": 2968, "time_per_iteration": 2.7246880531311035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055906, "balance_loss_mlp": 1.05082798, "diversity_loss_mlp": 0.0, "epoch": 0.5711812235475183, "flos": 1579922910720.0, "grad_norm": 0.032213471653528905, "language_loss": 0.80010808, "learning_rate": 0.0004095307052969399, "loss": 0.81066716, "num_input_tokens_seen": 247469136, "router_z_loss_mlp": 0.05078125, "routerloss_mlp": 0.0, "step": 2969, "time_per_iteration": 4.802457571029663 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00845784, "balance_loss_mlp": 1.44834208, "diversity_loss_mlp": 0.21849446, "epoch": 0.5713736052327818, "flos": 468506695680.0, "grad_norm": 0.042172582609019446, "language_loss": 0.80489594, "learning_rate": 0.00040922432276247107, "loss": 0.81335378, "num_input_tokens_seen": 247537712, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01236574, "step": 2970, "time_per_iteration": 2.579711675643921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01100592, "balance_loss_mlp": 1.09026289, "diversity_loss_mlp": 0.0, "epoch": 0.5715659869180454, "flos": 537662499840.0, "grad_norm": 0.08651791755700546, "language_loss": 0.84556907, "learning_rate": 0.0004089179754702457, "loss": 0.85657501, "num_input_tokens_seen": 247613872, "router_z_loss_mlp": 0.10333252, "routerloss_mlp": 0.0, "step": 2971, "time_per_iteration": 2.744509220123291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109172, "balance_loss_mlp": 1.08128309, "diversity_loss_mlp": 0.0, "epoch": 0.571758368603309, "flos": 656071778304.0, "grad_norm": 0.0875480726861112, "language_loss": 0.79658413, "learning_rate": 0.00040861166353919843, "loss": 0.80750132, "num_input_tokens_seen": 247686064, "router_z_loss_mlp": 0.10443115, "routerloss_mlp": 0.0, "step": 2972, "time_per_iteration": 2.816767692565918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00843649, "balance_loss_mlp": 1.44322622, "diversity_loss_mlp": 0.21953782, "epoch": 0.5719507502885726, "flos": 667907016192.0, "grad_norm": 0.0303598736791247, "language_loss": 0.81879437, "learning_rate": 0.00040830538708824983, "loss": 0.82723081, "num_input_tokens_seen": 247760384, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01226737, "step": 2973, "time_per_iteration": 2.8936269283294678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084736, "balance_loss_mlp": 1.07479978, "diversity_loss_mlp": 0.0, "epoch": 0.572143131973836, "flos": 476321969664.0, "grad_norm": 0.06866249599002382, "language_loss": 0.81754982, "learning_rate": 0.000407999146236307, "loss": 0.82839715, "num_input_tokens_seen": 247824768, "router_z_loss_mlp": 0.0993042, "routerloss_mlp": 0.0, "step": 2974, "time_per_iteration": 2.558587074279785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086743, "balance_loss_mlp": 1.07657444, "diversity_loss_mlp": 0.0, "epoch": 0.5723355136590996, "flos": 539510782464.0, "grad_norm": 0.07286762161416734, "language_loss": 0.83382261, "learning_rate": 0.0004076929411022634, "loss": 0.84468997, "num_input_tokens_seen": 247894448, "router_z_loss_mlp": 0.10168457, "routerloss_mlp": 0.0, "step": 2975, "time_per_iteration": 2.604498863220215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082309, "balance_loss_mlp": 1.07231879, "diversity_loss_mlp": 0.0, "epoch": 0.5725278953443632, "flos": 824156674560.0, "grad_norm": 0.06868291627032407, "language_loss": 0.79575276, "learning_rate": 0.0004073867718049982, "loss": 0.80657583, "num_input_tokens_seen": 247976432, "router_z_loss_mlp": 0.09991455, "routerloss_mlp": 0.0, "step": 2976, "time_per_iteration": 3.082519054412842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00841274, "balance_loss_mlp": 1.44052804, "diversity_loss_mlp": 0.21771878, "epoch": 0.5727202770296268, "flos": 587437235712.0, "grad_norm": 0.03510584247140754, "language_loss": 0.8255651, "learning_rate": 0.00040708063846337704, "loss": 0.83397782, "num_input_tokens_seen": 248048800, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01215104, "step": 2977, "time_per_iteration": 2.7563750743865967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108914, "balance_loss_mlp": 1.07897186, "diversity_loss_mlp": 0.0, "epoch": 0.5729126587148904, "flos": 446966055936.0, "grad_norm": 0.07105452232664011, "language_loss": 0.81019402, "learning_rate": 0.00040677454119625143, "loss": 0.82108539, "num_input_tokens_seen": 248116496, "router_z_loss_mlp": 0.10168457, "routerloss_mlp": 0.0, "step": 2978, "time_per_iteration": 2.575923442840576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089611, "balance_loss_mlp": 1.07962155, "diversity_loss_mlp": 0.0, "epoch": 0.5731050404001539, "flos": 519457577472.0, "grad_norm": 0.07243213986729599, "language_loss": 0.82912952, "learning_rate": 0.0004064684801224587, "loss": 0.84002566, "num_input_tokens_seen": 248184960, "router_z_loss_mlp": 0.09985352, "routerloss_mlp": 0.0, "step": 2979, "time_per_iteration": 2.5965535640716553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085844, "balance_loss_mlp": 1.07600939, "diversity_loss_mlp": 0.0, "epoch": 0.5732974220854175, "flos": 504775950336.0, "grad_norm": 0.11138747568582645, "language_loss": 0.80322999, "learning_rate": 0.00040616245536082224, "loss": 0.81408834, "num_input_tokens_seen": 248252208, "router_z_loss_mlp": 0.0982666, "routerloss_mlp": 0.0, "step": 2980, "time_per_iteration": 2.599320650100708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079792, "balance_loss_mlp": 1.07008803, "diversity_loss_mlp": 0.0, "epoch": 0.573489803770681, "flos": 592485041664.0, "grad_norm": 0.06764455313032879, "language_loss": 0.81366718, "learning_rate": 0.00040585646703015165, "loss": 0.82446504, "num_input_tokens_seen": 248333312, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 2981, "time_per_iteration": 2.8000056743621826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083988, "balance_loss_mlp": 1.0740515, "diversity_loss_mlp": 0.0, "epoch": 0.5736821854559446, "flos": 489911514624.0, "grad_norm": 0.07435230765684324, "language_loss": 0.78094304, "learning_rate": 0.0004055505152492419, "loss": 0.79178286, "num_input_tokens_seen": 248403808, "router_z_loss_mlp": 0.0993042, "routerloss_mlp": 0.0, "step": 2982, "time_per_iteration": 2.6867222785949707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075905, "balance_loss_mlp": 1.06574273, "diversity_loss_mlp": 0.0, "epoch": 0.5738745671412081, "flos": 458156321280.0, "grad_norm": 0.06874763078804642, "language_loss": 0.74040514, "learning_rate": 0.00040524460013687425, "loss": 0.7511642, "num_input_tokens_seen": 248477184, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 2983, "time_per_iteration": 2.722419500350952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070682, "balance_loss_mlp": 1.06058455, "diversity_loss_mlp": 0.0, "epoch": 0.5740669488264717, "flos": 580333372416.0, "grad_norm": 0.06717754752260814, "language_loss": 0.81118953, "learning_rate": 0.0004049387218118155, "loss": 0.82189637, "num_input_tokens_seen": 248565552, "router_z_loss_mlp": 0.10095215, "routerloss_mlp": 0.0, "step": 2984, "time_per_iteration": 2.960744857788086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065588, "balance_loss_mlp": 1.05519915, "diversity_loss_mlp": 0.0, "epoch": 0.5742593305117353, "flos": 524438572032.0, "grad_norm": 0.07543134348802799, "language_loss": 0.85138291, "learning_rate": 0.00040463288039281777, "loss": 0.86203879, "num_input_tokens_seen": 248635456, "router_z_loss_mlp": 0.10394287, "routerloss_mlp": 0.0, "step": 2985, "time_per_iteration": 2.769758939743042 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104327, "balance_loss_mlp": 1.03847778, "diversity_loss_mlp": 0.0, "epoch": 0.5744517121969989, "flos": 1553877748224.0, "grad_norm": 0.0202426857746204, "language_loss": 0.77876419, "learning_rate": 0.0004043270759986194, "loss": 0.78919691, "num_input_tokens_seen": 248870160, "router_z_loss_mlp": 0.04785156, "routerloss_mlp": 0.0, "step": 2986, "time_per_iteration": 4.966659784317017 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062164, "balance_loss_mlp": 1.05206716, "diversity_loss_mlp": 0.0, "epoch": 0.5746440938822625, "flos": 751919915520.0, "grad_norm": 0.15131369926607025, "language_loss": 0.82060635, "learning_rate": 0.0004040213087479444, "loss": 0.83122802, "num_input_tokens_seen": 248946960, "router_z_loss_mlp": 0.10095215, "routerloss_mlp": 0.0, "step": 2987, "time_per_iteration": 2.9445290565490723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071505, "balance_loss_mlp": 1.0615747, "diversity_loss_mlp": 0.0, "epoch": 0.5748364755675259, "flos": 501865320960.0, "grad_norm": 0.0782867157663105, "language_loss": 0.85397077, "learning_rate": 0.0004037155787595018, "loss": 0.86468589, "num_input_tokens_seen": 249014128, "router_z_loss_mlp": 0.0993042, "routerloss_mlp": 0.0, "step": 2988, "time_per_iteration": 2.5765254497528076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066911, "balance_loss_mlp": 1.05708241, "diversity_loss_mlp": 0.0, "epoch": 0.5750288572527895, "flos": 504044342784.0, "grad_norm": 0.06722963936024443, "language_loss": 0.80743146, "learning_rate": 0.000403409886151987, "loss": 0.81810057, "num_input_tokens_seen": 249090016, "router_z_loss_mlp": 0.0982666, "routerloss_mlp": 0.0, "step": 2989, "time_per_iteration": 2.916736364364624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01028923, "balance_loss_mlp": 1.02410662, "diversity_loss_mlp": 0.0, "epoch": 0.5752212389380531, "flos": 1541365604352.0, "grad_norm": 0.01652195359171043, "language_loss": 0.81999105, "learning_rate": 0.0004031042310440799, "loss": 0.8302803, "num_input_tokens_seen": 249305552, "router_z_loss_mlp": 0.0480957, "routerloss_mlp": 0.0, "step": 2990, "time_per_iteration": 4.79939866065979 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01019783, "balance_loss_mlp": 1.0149194, "diversity_loss_mlp": 0.0, "epoch": 0.5754136206233167, "flos": 1567331472384.0, "grad_norm": 0.012607930583697005, "language_loss": 0.781986, "learning_rate": 0.00040279861355444656, "loss": 0.79218388, "num_input_tokens_seen": 249523408, "router_z_loss_mlp": 0.04858398, "routerloss_mlp": 0.0, "step": 2991, "time_per_iteration": 4.873241901397705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107448, "balance_loss_mlp": 1.06493187, "diversity_loss_mlp": 0.0, "epoch": 0.5756060023085803, "flos": 798156301824.0, "grad_norm": 0.07321689676824589, "language_loss": 0.7675758, "learning_rate": 0.00040249303380173807, "loss": 0.77832061, "num_input_tokens_seen": 249616624, "router_z_loss_mlp": 0.09533691, "routerloss_mlp": 0.0, "step": 2992, "time_per_iteration": 3.119454860687256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075897, "balance_loss_mlp": 1.06607461, "diversity_loss_mlp": 0.0, "epoch": 0.5757983839938438, "flos": 587877004800.0, "grad_norm": 0.06951674167184135, "language_loss": 0.78929973, "learning_rate": 0.00040218749190459126, "loss": 0.80005872, "num_input_tokens_seen": 249689936, "router_z_loss_mlp": 0.09814453, "routerloss_mlp": 0.0, "step": 2993, "time_per_iteration": 2.735741138458252 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074749, "balance_loss_mlp": 1.06464601, "diversity_loss_mlp": 0.0, "epoch": 0.5759907656791073, "flos": 516831072768.0, "grad_norm": 0.09040694151318206, "language_loss": 0.82524914, "learning_rate": 0.00040188198798162775, "loss": 0.83599663, "num_input_tokens_seen": 249759984, "router_z_loss_mlp": 0.10101318, "routerloss_mlp": 0.0, "step": 2994, "time_per_iteration": 2.604189872741699 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107928, "balance_loss_mlp": 1.06903386, "diversity_loss_mlp": 0.0, "epoch": 0.5761831473643709, "flos": 587133287424.0, "grad_norm": 0.07247823517444965, "language_loss": 0.85413349, "learning_rate": 0.000401576522151455, "loss": 0.86492634, "num_input_tokens_seen": 249837888, "router_z_loss_mlp": 0.10247803, "routerloss_mlp": 0.0, "step": 2995, "time_per_iteration": 2.8580820560455322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082336, "balance_loss_mlp": 1.07231033, "diversity_loss_mlp": 0.0, "epoch": 0.5763755290496345, "flos": 543896363520.0, "grad_norm": 0.07641213429349043, "language_loss": 0.82611746, "learning_rate": 0.0004012710945326651, "loss": 0.83694082, "num_input_tokens_seen": 249913584, "router_z_loss_mlp": 0.10021973, "routerloss_mlp": 0.0, "step": 2996, "time_per_iteration": 2.7899913787841797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093927, "balance_loss_mlp": 1.08396673, "diversity_loss_mlp": 0.0, "epoch": 0.576567910734898, "flos": 626229107712.0, "grad_norm": 0.06499516885792743, "language_loss": 0.81305802, "learning_rate": 0.0004009657052438355, "loss": 0.82399726, "num_input_tokens_seen": 249992144, "router_z_loss_mlp": 0.0994873, "routerloss_mlp": 0.0, "step": 2997, "time_per_iteration": 2.7985143661499023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109354, "balance_loss_mlp": 1.08339536, "diversity_loss_mlp": 0.0, "epoch": 0.5767602924201616, "flos": 538243232256.0, "grad_norm": 0.07919341256021087, "language_loss": 0.85873878, "learning_rate": 0.00040066035440352904, "loss": 0.86967415, "num_input_tokens_seen": 250060736, "router_z_loss_mlp": 0.10137939, "routerloss_mlp": 0.0, "step": 2998, "time_per_iteration": 2.633052110671997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01032353, "balance_loss_mlp": 1.02706063, "diversity_loss_mlp": 0.0, "epoch": 0.5769526741054252, "flos": 1559778301440.0, "grad_norm": 0.024696349234847453, "language_loss": 0.79293132, "learning_rate": 0.0004003550421302934, "loss": 0.80325484, "num_input_tokens_seen": 250296864, "router_z_loss_mlp": 0.05297852, "routerloss_mlp": 0.0, "step": 2999, "time_per_iteration": 4.901000022888184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111633, "balance_loss_mlp": 1.10161996, "diversity_loss_mlp": 0.0, "epoch": 0.5771450557906888, "flos": 468185495040.0, "grad_norm": 0.09685011562347093, "language_loss": 0.76085562, "learning_rate": 0.00040004976854266145, "loss": 0.77197194, "num_input_tokens_seen": 250362528, "router_z_loss_mlp": 0.10009766, "routerloss_mlp": 0.0, "step": 3000, "time_per_iteration": 2.5440561771392822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01106478, "balance_loss_mlp": 1.09615445, "diversity_loss_mlp": 0.0, "epoch": 0.5773374374759523, "flos": 574556903424.0, "grad_norm": 0.08566214489971447, "language_loss": 0.81596673, "learning_rate": 0.0003997445337591505, "loss": 0.82703155, "num_input_tokens_seen": 250432768, "router_z_loss_mlp": 0.10327148, "routerloss_mlp": 0.0, "step": 3001, "time_per_iteration": 2.6576101779937744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101254, "balance_loss_mlp": 1.09120488, "diversity_loss_mlp": 0.0, "epoch": 0.5775298191612158, "flos": 528473590272.0, "grad_norm": 0.07034086792873868, "language_loss": 0.74008942, "learning_rate": 0.0003994393378982635, "loss": 0.75110197, "num_input_tokens_seen": 250501504, "router_z_loss_mlp": 0.10046387, "routerloss_mlp": 0.0, "step": 3002, "time_per_iteration": 2.646756172180176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01033287, "balance_loss_mlp": 1.02816153, "diversity_loss_mlp": 0.0, "epoch": 0.5777222008464794, "flos": 1303919700480.0, "grad_norm": 0.018933197318392565, "language_loss": 0.79538, "learning_rate": 0.00039913418107848786, "loss": 0.80571294, "num_input_tokens_seen": 250733632, "router_z_loss_mlp": 0.05126953, "routerloss_mlp": 0.0, "step": 3003, "time_per_iteration": 4.810927867889404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084692, "balance_loss_mlp": 1.07440448, "diversity_loss_mlp": 0.0, "epoch": 0.577914582531743, "flos": 603633461760.0, "grad_norm": 0.09168460196837042, "language_loss": 0.8788178, "learning_rate": 0.0003988290634182961, "loss": 0.88966477, "num_input_tokens_seen": 250809152, "router_z_loss_mlp": 0.10290527, "routerloss_mlp": 0.0, "step": 3004, "time_per_iteration": 2.8026678562164307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086517, "balance_loss_mlp": 1.0765686, "diversity_loss_mlp": 0.0, "epoch": 0.5781069642170066, "flos": 486795681792.0, "grad_norm": 0.07023697016091271, "language_loss": 0.80836314, "learning_rate": 0.0003985239850361453, "loss": 0.81922829, "num_input_tokens_seen": 250879152, "router_z_loss_mlp": 0.09942627, "routerloss_mlp": 0.0, "step": 3005, "time_per_iteration": 2.605581760406494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108379, "balance_loss_mlp": 1.0739491, "diversity_loss_mlp": 0.0, "epoch": 0.5782993459022701, "flos": 506295318528.0, "grad_norm": 0.08589270039345176, "language_loss": 0.84542817, "learning_rate": 0.0003982189460504777, "loss": 0.85626608, "num_input_tokens_seen": 250949904, "router_z_loss_mlp": 0.09838867, "routerloss_mlp": 0.0, "step": 3006, "time_per_iteration": 2.755309820175171 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081707, "balance_loss_mlp": 1.07148504, "diversity_loss_mlp": 0.0, "epoch": 0.5784917275875336, "flos": 602155938816.0, "grad_norm": 0.07367765629951939, "language_loss": 0.79058981, "learning_rate": 0.00039791394657971935, "loss": 0.80140698, "num_input_tokens_seen": 251020976, "router_z_loss_mlp": 0.10223389, "routerloss_mlp": 0.0, "step": 3007, "time_per_iteration": 2.7115721702575684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083463, "balance_loss_mlp": 1.07349145, "diversity_loss_mlp": 0.0, "epoch": 0.5786841092727972, "flos": 521540425728.0, "grad_norm": 0.08639799759711958, "language_loss": 0.84195948, "learning_rate": 0.00039760898674228205, "loss": 0.85279417, "num_input_tokens_seen": 251093280, "router_z_loss_mlp": 0.09967041, "routerloss_mlp": 0.0, "step": 3008, "time_per_iteration": 2.6536192893981934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082898, "balance_loss_mlp": 1.07249665, "diversity_loss_mlp": 0.0, "epoch": 0.5788764909580608, "flos": 767404357632.0, "grad_norm": 0.06522284264232586, "language_loss": 0.80620825, "learning_rate": 0.0003973040666565613, "loss": 0.81703728, "num_input_tokens_seen": 251181376, "router_z_loss_mlp": 0.10406494, "routerloss_mlp": 0.0, "step": 3009, "time_per_iteration": 3.0663528442382812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083981, "balance_loss_mlp": 1.07382393, "diversity_loss_mlp": 0.0, "epoch": 0.5790688726433244, "flos": 599094434304.0, "grad_norm": 0.06612730330601824, "language_loss": 0.82148051, "learning_rate": 0.000396999186440938, "loss": 0.83232027, "num_input_tokens_seen": 251256176, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 3010, "time_per_iteration": 2.8332176208496094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078314, "balance_loss_mlp": 1.06794286, "diversity_loss_mlp": 0.0, "epoch": 0.5792612543285879, "flos": 523064936448.0, "grad_norm": 0.0828593686110812, "language_loss": 0.85258269, "learning_rate": 0.000396694346213777, "loss": 0.86336583, "num_input_tokens_seen": 251325344, "router_z_loss_mlp": 0.10369873, "routerloss_mlp": 0.0, "step": 3011, "time_per_iteration": 2.6009714603424072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107282, "balance_loss_mlp": 1.06272256, "diversity_loss_mlp": 0.0, "epoch": 0.5794536360138515, "flos": 876557915136.0, "grad_norm": 0.06962390382868744, "language_loss": 0.83265769, "learning_rate": 0.0003963895460934276, "loss": 0.84338593, "num_input_tokens_seen": 251406656, "router_z_loss_mlp": 0.10095215, "routerloss_mlp": 0.0, "step": 3012, "time_per_iteration": 3.1654391288757324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069146, "balance_loss_mlp": 1.05900097, "diversity_loss_mlp": 0.0, "epoch": 0.5796460176991151, "flos": 401436311040.0, "grad_norm": 0.07925389671051855, "language_loss": 0.84790504, "learning_rate": 0.00039608478619822376, "loss": 0.85859656, "num_input_tokens_seen": 251467760, "router_z_loss_mlp": 0.10144043, "routerloss_mlp": 0.0, "step": 3013, "time_per_iteration": 2.427522659301758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067828, "balance_loss_mlp": 1.05792189, "diversity_loss_mlp": 0.0, "epoch": 0.5798383993843786, "flos": 618517721088.0, "grad_norm": 0.06006231039706783, "language_loss": 0.82350284, "learning_rate": 0.00039578006664648394, "loss": 0.83418107, "num_input_tokens_seen": 251542272, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3014, "time_per_iteration": 2.744586229324341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073341, "balance_loss_mlp": 1.06352377, "diversity_loss_mlp": 0.0, "epoch": 0.5800307810696421, "flos": 844331019264.0, "grad_norm": 0.06972986465808689, "language_loss": 0.81348431, "learning_rate": 0.0003954753875565105, "loss": 0.82421774, "num_input_tokens_seen": 251625584, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3015, "time_per_iteration": 3.0640695095062256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072847, "balance_loss_mlp": 1.06282723, "diversity_loss_mlp": 0.0, "epoch": 0.5802231627549057, "flos": 569276729856.0, "grad_norm": 0.07357715078918559, "language_loss": 0.82623494, "learning_rate": 0.00039517074904659057, "loss": 0.83696342, "num_input_tokens_seen": 251696704, "router_z_loss_mlp": 0.10015869, "routerloss_mlp": 0.0, "step": 3016, "time_per_iteration": 2.6665265560150146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010727, "balance_loss_mlp": 1.06269789, "diversity_loss_mlp": 0.0, "epoch": 0.5804155444401693, "flos": 660459930624.0, "grad_norm": 0.06753013197016527, "language_loss": 0.84737754, "learning_rate": 0.00039486615123499535, "loss": 0.85810453, "num_input_tokens_seen": 251774784, "router_z_loss_mlp": 0.10003662, "routerloss_mlp": 0.0, "step": 3017, "time_per_iteration": 2.868724822998047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067949, "balance_loss_mlp": 1.05761325, "diversity_loss_mlp": 0.0, "epoch": 0.5806079261254329, "flos": 513992024064.0, "grad_norm": 0.06414820954678578, "language_loss": 0.84855384, "learning_rate": 0.00039456159423997996, "loss": 0.85923326, "num_input_tokens_seen": 251844768, "router_z_loss_mlp": 0.10333252, "routerloss_mlp": 0.0, "step": 3018, "time_per_iteration": 2.7043581008911133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067563, "balance_loss_mlp": 1.05765033, "diversity_loss_mlp": 0.0, "epoch": 0.5808003078106965, "flos": 528646487040.0, "grad_norm": 0.06908857206879536, "language_loss": 0.89950442, "learning_rate": 0.00039425707817978406, "loss": 0.91018009, "num_input_tokens_seen": 251912736, "router_z_loss_mlp": 0.09906006, "routerloss_mlp": 0.0, "step": 3019, "time_per_iteration": 2.661128044128418 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106838, "balance_loss_mlp": 1.0578835, "diversity_loss_mlp": 0.0, "epoch": 0.58099268949596, "flos": 477028611072.0, "grad_norm": 0.08125232064199928, "language_loss": 0.83649898, "learning_rate": 0.00039395260317263124, "loss": 0.84718275, "num_input_tokens_seen": 251979328, "router_z_loss_mlp": 0.1050415, "routerloss_mlp": 0.0, "step": 3020, "time_per_iteration": 2.5645148754119873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070808, "balance_loss_mlp": 1.06039524, "diversity_loss_mlp": 0.0, "epoch": 0.5811850711812235, "flos": 517609294848.0, "grad_norm": 0.06887634041791851, "language_loss": 0.85043871, "learning_rate": 0.0003936481693367291, "loss": 0.86114681, "num_input_tokens_seen": 252050928, "router_z_loss_mlp": 0.10418701, "routerloss_mlp": 0.0, "step": 3021, "time_per_iteration": 2.7062771320343018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077533, "balance_loss_mlp": 1.06673217, "diversity_loss_mlp": 0.0, "epoch": 0.5813774528664871, "flos": 616422389760.0, "grad_norm": 0.08641696356618225, "language_loss": 0.87619507, "learning_rate": 0.0003933437767902697, "loss": 0.88697034, "num_input_tokens_seen": 252126496, "router_z_loss_mlp": 0.10803223, "routerloss_mlp": 0.0, "step": 3022, "time_per_iteration": 2.7680017948150635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078804, "balance_loss_mlp": 1.06846249, "diversity_loss_mlp": 0.0, "epoch": 0.5815698345517507, "flos": 567475435008.0, "grad_norm": 0.0708496595357851, "language_loss": 0.78467089, "learning_rate": 0.00039303942565142825, "loss": 0.79545891, "num_input_tokens_seen": 252203008, "router_z_loss_mlp": 0.10345459, "routerloss_mlp": 0.0, "step": 3023, "time_per_iteration": 2.7319986820220947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071706, "balance_loss_mlp": 1.06121564, "diversity_loss_mlp": 0.0, "epoch": 0.5817622162370142, "flos": 563168775168.0, "grad_norm": 0.06941107329713525, "language_loss": 0.76844412, "learning_rate": 0.0003927351160383644, "loss": 0.77916121, "num_input_tokens_seen": 252283440, "router_z_loss_mlp": 0.10498047, "routerloss_mlp": 0.0, "step": 3024, "time_per_iteration": 2.7925262451171875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069902, "balance_loss_mlp": 1.05980492, "diversity_loss_mlp": 0.0, "epoch": 0.5819545979222778, "flos": 459216470016.0, "grad_norm": 0.07084631667240687, "language_loss": 0.77815473, "learning_rate": 0.000392430848069222, "loss": 0.78885376, "num_input_tokens_seen": 252351760, "router_z_loss_mlp": 0.10095215, "routerloss_mlp": 0.0, "step": 3025, "time_per_iteration": 2.5290136337280273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075514, "balance_loss_mlp": 1.06532741, "diversity_loss_mlp": 0.0, "epoch": 0.5821469796075414, "flos": 541475062272.0, "grad_norm": 0.07224483468752362, "language_loss": 0.82501459, "learning_rate": 0.00039212662186212795, "loss": 0.83576977, "num_input_tokens_seen": 252418480, "router_z_loss_mlp": 0.10186768, "routerloss_mlp": 0.0, "step": 3026, "time_per_iteration": 2.6017684936523438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106609, "balance_loss_mlp": 1.05593956, "diversity_loss_mlp": 0.0, "epoch": 0.582339361292805, "flos": 552262634496.0, "grad_norm": 0.05478704818063415, "language_loss": 0.77076197, "learning_rate": 0.0003918224375351934, "loss": 0.78142285, "num_input_tokens_seen": 252493712, "router_z_loss_mlp": 0.10150146, "routerloss_mlp": 0.0, "step": 3027, "time_per_iteration": 2.707127571105957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069708, "balance_loss_mlp": 1.05940795, "diversity_loss_mlp": 0.0, "epoch": 0.5825317429780685, "flos": 496399767552.0, "grad_norm": 0.07026049561627037, "language_loss": 0.78559566, "learning_rate": 0.0003915182952065135, "loss": 0.79629278, "num_input_tokens_seen": 252566096, "router_z_loss_mlp": 0.10302734, "routerloss_mlp": 0.0, "step": 3028, "time_per_iteration": 2.6728062629699707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00863261, "balance_loss_mlp": 1.48110199, "diversity_loss_mlp": 0.21947324, "epoch": 0.582724124663332, "flos": 564162112512.0, "grad_norm": 0.028926470462326558, "language_loss": 0.87632734, "learning_rate": 0.0003912141949941664, "loss": 0.88495994, "num_input_tokens_seen": 252639424, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0129736, "step": 3029, "time_per_iteration": 2.7290279865264893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068105, "balance_loss_mlp": 1.05748928, "diversity_loss_mlp": 0.0, "epoch": 0.5829165063485956, "flos": 492132754944.0, "grad_norm": 0.11092566755711959, "language_loss": 0.82848042, "learning_rate": 0.0003909101370162143, "loss": 0.83916146, "num_input_tokens_seen": 252706672, "router_z_loss_mlp": 0.10620117, "routerloss_mlp": 0.0, "step": 3030, "time_per_iteration": 2.5907628536224365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057716, "balance_loss_mlp": 1.05161262, "diversity_loss_mlp": 0.0, "epoch": 0.5831088880338592, "flos": 1528880997888.0, "grad_norm": 0.028764883169419067, "language_loss": 0.72433889, "learning_rate": 0.00039060612139070326, "loss": 0.73491609, "num_input_tokens_seen": 252932464, "router_z_loss_mlp": 0.06103516, "routerloss_mlp": 0.0, "step": 3031, "time_per_iteration": 4.87787127494812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066859, "balance_loss_mlp": 1.05651772, "diversity_loss_mlp": 0.0, "epoch": 0.5833012697191228, "flos": 618011140608.0, "grad_norm": 0.06710106844205427, "language_loss": 0.82853395, "learning_rate": 0.0003903021482356622, "loss": 0.83920258, "num_input_tokens_seen": 253011920, "router_z_loss_mlp": 0.10345459, "routerloss_mlp": 0.0, "step": 3032, "time_per_iteration": 2.777536153793335 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067707, "balance_loss_mlp": 1.05757427, "diversity_loss_mlp": 0.0, "epoch": 0.5834936514043862, "flos": 767920849920.0, "grad_norm": 0.05521171326439417, "language_loss": 0.82775813, "learning_rate": 0.00038999821766910465, "loss": 0.83843517, "num_input_tokens_seen": 253091552, "router_z_loss_mlp": 0.10131836, "routerloss_mlp": 0.0, "step": 3033, "time_per_iteration": 2.990370035171509 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064798, "balance_loss_mlp": 1.05444503, "diversity_loss_mlp": 0.0, "epoch": 0.5836860330896498, "flos": 458371436544.0, "grad_norm": 0.06933125597123427, "language_loss": 0.85725427, "learning_rate": 0.00038969432980902606, "loss": 0.86790228, "num_input_tokens_seen": 253158608, "router_z_loss_mlp": 0.10357666, "routerloss_mlp": 0.0, "step": 3034, "time_per_iteration": 2.522594690322876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101659, "balance_loss_mlp": 1.01134527, "diversity_loss_mlp": 0.0, "epoch": 0.5838784147749134, "flos": 1361225585664.0, "grad_norm": 0.016170176694849804, "language_loss": 0.79784501, "learning_rate": 0.0003893904847734068, "loss": 0.80801094, "num_input_tokens_seen": 253381184, "router_z_loss_mlp": 0.05249023, "routerloss_mlp": 0.0, "step": 3035, "time_per_iteration": 4.804777383804321 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070254, "balance_loss_mlp": 1.06007361, "diversity_loss_mlp": 0.0, "epoch": 0.584070796460177, "flos": 567211133952.0, "grad_norm": 0.06630987198212972, "language_loss": 0.82630336, "learning_rate": 0.00038908668268020953, "loss": 0.83700585, "num_input_tokens_seen": 253452880, "router_z_loss_mlp": 0.10180664, "routerloss_mlp": 0.0, "step": 3036, "time_per_iteration": 2.6598165035247803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064922, "balance_loss_mlp": 1.0547123, "diversity_loss_mlp": 0.0, "epoch": 0.5842631781454406, "flos": 611483240448.0, "grad_norm": 0.06353975651870693, "language_loss": 0.85077345, "learning_rate": 0.00038878292364738097, "loss": 0.86142278, "num_input_tokens_seen": 253530000, "router_z_loss_mlp": 0.10211182, "routerloss_mlp": 0.0, "step": 3037, "time_per_iteration": 2.817431688308716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066587, "balance_loss_mlp": 1.05653155, "diversity_loss_mlp": 0.0, "epoch": 0.5844555598307041, "flos": 463384737792.0, "grad_norm": 0.06847185322789755, "language_loss": 0.86992419, "learning_rate": 0.0003884792077928508, "loss": 0.88059008, "num_input_tokens_seen": 253593504, "router_z_loss_mlp": 0.10058594, "routerloss_mlp": 0.0, "step": 3038, "time_per_iteration": 2.515582323074341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067425, "balance_loss_mlp": 1.05704808, "diversity_loss_mlp": 0.0, "epoch": 0.5846479415159677, "flos": 410215186944.0, "grad_norm": 0.08132102193369704, "language_loss": 0.76704037, "learning_rate": 0.0003881755352345322, "loss": 0.77771461, "num_input_tokens_seen": 253657904, "router_z_loss_mlp": 0.1038208, "routerloss_mlp": 0.0, "step": 3039, "time_per_iteration": 2.506476402282715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070034, "balance_loss_mlp": 1.05959702, "diversity_loss_mlp": 0.0, "epoch": 0.5848403232012312, "flos": 491297633280.0, "grad_norm": 0.05655703451029381, "language_loss": 0.87182224, "learning_rate": 0.0003878719060903207, "loss": 0.88252252, "num_input_tokens_seen": 253725280, "router_z_loss_mlp": 0.10437012, "routerloss_mlp": 0.0, "step": 3040, "time_per_iteration": 2.5755503177642822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077595, "balance_loss_mlp": 1.06733704, "diversity_loss_mlp": 0.0, "epoch": 0.5850327048864948, "flos": 584417949696.0, "grad_norm": 0.07213898072930079, "language_loss": 0.83620822, "learning_rate": 0.0003875683204780961, "loss": 0.84698415, "num_input_tokens_seen": 253795040, "router_z_loss_mlp": 0.1026001, "routerloss_mlp": 0.0, "step": 3041, "time_per_iteration": 2.7087528705596924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00858209, "balance_loss_mlp": 1.47420132, "diversity_loss_mlp": 0.21720865, "epoch": 0.5852250865717584, "flos": 651545233920.0, "grad_norm": 0.0337374590034744, "language_loss": 0.85750413, "learning_rate": 0.00038726477851572043, "loss": 0.86608613, "num_input_tokens_seen": 253866384, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01250451, "step": 3042, "time_per_iteration": 2.8391060829162598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085797, "balance_loss_mlp": 1.07552087, "diversity_loss_mlp": 0.0, "epoch": 0.5854174682570219, "flos": 534588885504.0, "grad_norm": 0.07424787281712622, "language_loss": 0.8043561, "learning_rate": 0.0003869612803210395, "loss": 0.81521404, "num_input_tokens_seen": 253935712, "router_z_loss_mlp": 0.1027832, "routerloss_mlp": 0.0, "step": 3043, "time_per_iteration": 2.6728439331054688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085503, "balance_loss_mlp": 1.07525158, "diversity_loss_mlp": 0.0, "epoch": 0.5856098499422855, "flos": 509752175616.0, "grad_norm": 0.0731909762270397, "language_loss": 0.83286428, "learning_rate": 0.0003866578260118817, "loss": 0.8437193, "num_input_tokens_seen": 254003152, "router_z_loss_mlp": 0.10253906, "routerloss_mlp": 0.0, "step": 3044, "time_per_iteration": 2.6332969665527344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108221, "balance_loss_mlp": 1.07239914, "diversity_loss_mlp": 0.0, "epoch": 0.5858022316275491, "flos": 593893555200.0, "grad_norm": 0.07445534470947208, "language_loss": 0.82966632, "learning_rate": 0.0003863544157060581, "loss": 0.84048843, "num_input_tokens_seen": 254072816, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3045, "time_per_iteration": 2.668837785720825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081605, "balance_loss_mlp": 1.07137656, "diversity_loss_mlp": 0.0, "epoch": 0.5859946133128127, "flos": 559126416384.0, "grad_norm": 0.07387128485113956, "language_loss": 0.82359195, "learning_rate": 0.0003860510495213634, "loss": 0.83440793, "num_input_tokens_seen": 254152800, "router_z_loss_mlp": 0.10223389, "routerloss_mlp": 0.0, "step": 3046, "time_per_iteration": 2.8229498863220215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106696, "balance_loss_mlp": 1.05705416, "diversity_loss_mlp": 0.0, "epoch": 0.5861869949980761, "flos": 553695740928.0, "grad_norm": 0.08160785595799389, "language_loss": 0.78622752, "learning_rate": 0.0003857477275755746, "loss": 0.79689717, "num_input_tokens_seen": 254224384, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3047, "time_per_iteration": 2.6294050216674805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066008, "balance_loss_mlp": 1.0557915, "diversity_loss_mlp": 0.0, "epoch": 0.5863793766833397, "flos": 718667375616.0, "grad_norm": 0.0580402220657833, "language_loss": 0.83646655, "learning_rate": 0.00038544444998645167, "loss": 0.84712666, "num_input_tokens_seen": 254310960, "router_z_loss_mlp": 0.10217285, "routerloss_mlp": 0.0, "step": 3048, "time_per_iteration": 3.0289785861968994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059507, "balance_loss_mlp": 1.04951751, "diversity_loss_mlp": 0.0, "epoch": 0.5865717583686033, "flos": 472289522688.0, "grad_norm": 0.0674332369398686, "language_loss": 0.81847656, "learning_rate": 0.00038514121687173767, "loss": 0.82907164, "num_input_tokens_seen": 254378336, "router_z_loss_mlp": 0.09991455, "routerloss_mlp": 0.0, "step": 3049, "time_per_iteration": 2.5797152519226074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058576, "balance_loss_mlp": 1.04861593, "diversity_loss_mlp": 0.0, "epoch": 0.5867641400538669, "flos": 813482901504.0, "grad_norm": 0.08495884025795868, "language_loss": 0.82019609, "learning_rate": 0.00038483802834915807, "loss": 0.83078188, "num_input_tokens_seen": 254454352, "router_z_loss_mlp": 0.09960938, "routerloss_mlp": 0.0, "step": 3050, "time_per_iteration": 3.0199241638183594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061818, "balance_loss_mlp": 1.05154216, "diversity_loss_mlp": 0.0, "epoch": 0.5869565217391305, "flos": 486531380736.0, "grad_norm": 0.07816426751212531, "language_loss": 0.78978479, "learning_rate": 0.00038453488453642074, "loss": 0.800403, "num_input_tokens_seen": 254526352, "router_z_loss_mlp": 0.1027832, "routerloss_mlp": 0.0, "step": 3051, "time_per_iteration": 2.7338953018188477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105642, "balance_loss_mlp": 1.04610801, "diversity_loss_mlp": 0.0, "epoch": 0.587148903424394, "flos": 569385386496.0, "grad_norm": 0.07385283463746846, "language_loss": 0.86878967, "learning_rate": 0.00038423178555121697, "loss": 0.87935388, "num_input_tokens_seen": 254598720, "router_z_loss_mlp": 0.10308838, "routerloss_mlp": 0.0, "step": 3052, "time_per_iteration": 2.7545297145843506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058315, "balance_loss_mlp": 1.04783666, "diversity_loss_mlp": 0.0, "epoch": 0.5873412851096576, "flos": 747296824320.0, "grad_norm": 0.07920619209623277, "language_loss": 0.85583031, "learning_rate": 0.00038392873151121994, "loss": 0.86641347, "num_input_tokens_seen": 254683664, "router_z_loss_mlp": 0.1048584, "routerloss_mlp": 0.0, "step": 3053, "time_per_iteration": 3.07143235206604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059791, "balance_loss_mlp": 1.04924083, "diversity_loss_mlp": 0.0, "epoch": 0.5875336667949211, "flos": 528142477824.0, "grad_norm": 0.07754087781816771, "language_loss": 0.83137167, "learning_rate": 0.0003836257225340859, "loss": 0.84196955, "num_input_tokens_seen": 254754688, "router_z_loss_mlp": 0.10552979, "routerloss_mlp": 0.0, "step": 3054, "time_per_iteration": 2.6132304668426514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066843, "balance_loss_mlp": 1.05597091, "diversity_loss_mlp": 0.0, "epoch": 0.5877260484801847, "flos": 824166586368.0, "grad_norm": 0.0689474058081498, "language_loss": 0.82020974, "learning_rate": 0.00038332275873745336, "loss": 0.83087826, "num_input_tokens_seen": 254838976, "router_z_loss_mlp": 0.10882568, "routerloss_mlp": 0.0, "step": 3055, "time_per_iteration": 3.107823371887207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00855378, "balance_loss_mlp": 1.46855807, "diversity_loss_mlp": 0.21676093, "epoch": 0.5879184301654482, "flos": 591598162944.0, "grad_norm": 0.026786885849911755, "language_loss": 0.82891941, "learning_rate": 0.0003830198402389431, "loss": 0.83747321, "num_input_tokens_seen": 254912912, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01271825, "step": 3056, "time_per_iteration": 2.7645249366760254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01040709, "balance_loss_mlp": 1.03548789, "diversity_loss_mlp": 0.0, "epoch": 0.5881108118507118, "flos": 1545805513728.0, "grad_norm": 0.027829027984012215, "language_loss": 0.77348936, "learning_rate": 0.0003827169671561585, "loss": 0.78389645, "num_input_tokens_seen": 255151488, "router_z_loss_mlp": 0.05224609, "routerloss_mlp": 0.0, "step": 3057, "time_per_iteration": 4.995454549789429 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082248, "balance_loss_mlp": 1.07115602, "diversity_loss_mlp": 0.0, "epoch": 0.5883031935359754, "flos": 489597654528.0, "grad_norm": 0.10105227922023945, "language_loss": 0.83302426, "learning_rate": 0.0003824141396066855, "loss": 0.8438468, "num_input_tokens_seen": 255218896, "router_z_loss_mlp": 0.11096191, "routerloss_mlp": 0.0, "step": 3058, "time_per_iteration": 2.568283796310425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086707, "balance_loss_mlp": 1.07570362, "diversity_loss_mlp": 0.0, "epoch": 0.588495575221239, "flos": 582836539392.0, "grad_norm": 0.10870959422332387, "language_loss": 0.8283565, "learning_rate": 0.000382111357708092, "loss": 0.83922356, "num_input_tokens_seen": 255287408, "router_z_loss_mlp": 0.10998535, "routerloss_mlp": 0.0, "step": 3059, "time_per_iteration": 2.7063958644866943 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080617, "balance_loss_mlp": 1.06985879, "diversity_loss_mlp": 0.0, "epoch": 0.5886879569065026, "flos": 661048003584.0, "grad_norm": 0.09017347087331092, "language_loss": 0.83373827, "learning_rate": 0.00038180862157792864, "loss": 0.84454447, "num_input_tokens_seen": 255358432, "router_z_loss_mlp": 0.10760498, "routerloss_mlp": 0.0, "step": 3060, "time_per_iteration": 2.7716259956359863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071354, "balance_loss_mlp": 1.06098306, "diversity_loss_mlp": 0.0, "epoch": 0.588880338591766, "flos": 562657425408.0, "grad_norm": 0.06780881013643715, "language_loss": 0.81814772, "learning_rate": 0.0003815059313337279, "loss": 0.82886124, "num_input_tokens_seen": 255425744, "router_z_loss_mlp": 0.10375977, "routerloss_mlp": 0.0, "step": 3061, "time_per_iteration": 2.664134979248047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072016, "balance_loss_mlp": 1.06180596, "diversity_loss_mlp": 0.0, "epoch": 0.5890727202770296, "flos": 554730923520.0, "grad_norm": 0.06335749004143083, "language_loss": 0.78063929, "learning_rate": 0.00038120328709300436, "loss": 0.79135942, "num_input_tokens_seen": 255505808, "router_z_loss_mlp": 0.10211182, "routerloss_mlp": 0.0, "step": 3062, "time_per_iteration": 2.8627028465270996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066046, "balance_loss_mlp": 1.05566847, "diversity_loss_mlp": 0.0, "epoch": 0.5892651019622932, "flos": 655520781312.0, "grad_norm": 0.06769296518732247, "language_loss": 0.8382163, "learning_rate": 0.0003809006889732549, "loss": 0.84887671, "num_input_tokens_seen": 255580160, "router_z_loss_mlp": 0.1038208, "routerloss_mlp": 0.0, "step": 3063, "time_per_iteration": 2.809983253479004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066721, "balance_loss_mlp": 1.05686879, "diversity_loss_mlp": 0.0, "epoch": 0.5894574836475568, "flos": 453202490880.0, "grad_norm": 0.07471445768221775, "language_loss": 0.88052714, "learning_rate": 0.0003805981370919589, "loss": 0.89119434, "num_input_tokens_seen": 255644016, "router_z_loss_mlp": 0.09844971, "routerloss_mlp": 0.0, "step": 3064, "time_per_iteration": 2.526881456375122 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106806, "balance_loss_mlp": 1.05822492, "diversity_loss_mlp": 0.0, "epoch": 0.5896498653328203, "flos": 519032489472.0, "grad_norm": 0.06588713514234819, "language_loss": 0.83812523, "learning_rate": 0.0003802956315665771, "loss": 0.84880579, "num_input_tokens_seen": 255718192, "router_z_loss_mlp": 0.0982666, "routerloss_mlp": 0.0, "step": 3065, "time_per_iteration": 2.6691834926605225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072593, "balance_loss_mlp": 1.06285346, "diversity_loss_mlp": 0.0, "epoch": 0.5898422470180839, "flos": 549050628096.0, "grad_norm": 0.11425397529110681, "language_loss": 0.8185159, "learning_rate": 0.0003799931725145529, "loss": 0.82924175, "num_input_tokens_seen": 255787696, "router_z_loss_mlp": 0.09729004, "routerloss_mlp": 0.0, "step": 3066, "time_per_iteration": 2.6098556518554688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077112, "balance_loss_mlp": 1.06719375, "diversity_loss_mlp": 0.0, "epoch": 0.5900346287033474, "flos": 524312663040.0, "grad_norm": 0.07983506473752326, "language_loss": 0.85902935, "learning_rate": 0.00037969076005331083, "loss": 0.86980045, "num_input_tokens_seen": 255862992, "router_z_loss_mlp": 0.09912109, "routerloss_mlp": 0.0, "step": 3067, "time_per_iteration": 2.7626185417175293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081851, "balance_loss_mlp": 1.07184935, "diversity_loss_mlp": 0.0, "epoch": 0.590227010388611, "flos": 567156805632.0, "grad_norm": 0.07247659487205776, "language_loss": 0.8802191, "learning_rate": 0.00037938839430025817, "loss": 0.89103758, "num_input_tokens_seen": 255931872, "router_z_loss_mlp": 0.09997559, "routerloss_mlp": 0.0, "step": 3068, "time_per_iteration": 2.6493396759033203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088575, "balance_loss_mlp": 1.07886577, "diversity_loss_mlp": 0.0, "epoch": 0.5904193920738746, "flos": 583333208064.0, "grad_norm": 0.0655302097756617, "language_loss": 0.85496283, "learning_rate": 0.0003790860753727835, "loss": 0.8658486, "num_input_tokens_seen": 256004656, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3069, "time_per_iteration": 2.7941815853118896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089673, "balance_loss_mlp": 1.07995713, "diversity_loss_mlp": 0.0, "epoch": 0.5906117737591381, "flos": 529701493248.0, "grad_norm": 0.0796849495747384, "language_loss": 0.82864797, "learning_rate": 0.00037878380338825766, "loss": 0.83954477, "num_input_tokens_seen": 256076944, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3070, "time_per_iteration": 2.6861939430236816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01102877, "balance_loss_mlp": 1.09311378, "diversity_loss_mlp": 0.0, "epoch": 0.5908041554444017, "flos": 684229151232.0, "grad_norm": 0.08458672700427887, "language_loss": 0.81556624, "learning_rate": 0.00037848157846403287, "loss": 0.82659507, "num_input_tokens_seen": 256154768, "router_z_loss_mlp": 0.09753418, "routerloss_mlp": 0.0, "step": 3071, "time_per_iteration": 2.873662233352661 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101959, "balance_loss_mlp": 1.09236836, "diversity_loss_mlp": 0.0, "epoch": 0.5909965371296653, "flos": 550001746944.0, "grad_norm": 0.07248408902015292, "language_loss": 0.83281767, "learning_rate": 0.0003781794007174435, "loss": 0.84383726, "num_input_tokens_seen": 256230896, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3072, "time_per_iteration": 2.762472629547119 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088348, "balance_loss_mlp": 1.08360386, "diversity_loss_mlp": 0.0, "epoch": 0.5911889188149289, "flos": 1492361750016.0, "grad_norm": 0.032251872290910595, "language_loss": 0.74074531, "learning_rate": 0.0003778772702658051, "loss": 0.75162888, "num_input_tokens_seen": 256462336, "router_z_loss_mlp": 0.04736328, "routerloss_mlp": 0.0, "step": 3073, "time_per_iteration": 4.854618787765503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01107188, "balance_loss_mlp": 1.09715033, "diversity_loss_mlp": 0.0, "epoch": 0.5913813005001923, "flos": 487880423424.0, "grad_norm": 0.058981009489694675, "language_loss": 0.80947924, "learning_rate": 0.0003775751872264152, "loss": 0.8205511, "num_input_tokens_seen": 256539376, "router_z_loss_mlp": 0.1003418, "routerloss_mlp": 0.0, "step": 3074, "time_per_iteration": 2.771085023880005 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01101985, "balance_loss_mlp": 1.09195375, "diversity_loss_mlp": 0.0, "epoch": 0.5915736821854559, "flos": 573331198464.0, "grad_norm": 0.056077752757325364, "language_loss": 0.87175214, "learning_rate": 0.0003772731517165527, "loss": 0.88277197, "num_input_tokens_seen": 256617728, "router_z_loss_mlp": 0.10028076, "routerloss_mlp": 0.0, "step": 3075, "time_per_iteration": 2.8292393684387207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01103862, "balance_loss_mlp": 1.09419441, "diversity_loss_mlp": 0.0, "epoch": 0.5917660638707195, "flos": 789518389248.0, "grad_norm": 0.07602524147414737, "language_loss": 0.83311272, "learning_rate": 0.0003769711638534784, "loss": 0.84415126, "num_input_tokens_seen": 256696032, "router_z_loss_mlp": 0.09655762, "routerloss_mlp": 0.0, "step": 3076, "time_per_iteration": 2.97261381149292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01099488, "balance_loss_mlp": 1.08962953, "diversity_loss_mlp": 0.0, "epoch": 0.5919584455559831, "flos": 528740462592.0, "grad_norm": 0.07287223806238774, "language_loss": 0.79046565, "learning_rate": 0.00037666922375443446, "loss": 0.8014605, "num_input_tokens_seen": 256767360, "router_z_loss_mlp": 0.09851074, "routerloss_mlp": 0.0, "step": 3077, "time_per_iteration": 2.6755480766296387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093205, "balance_loss_mlp": 1.08349538, "diversity_loss_mlp": 0.0, "epoch": 0.5921508272412467, "flos": 560606510592.0, "grad_norm": 0.06803693763690793, "language_loss": 0.81907725, "learning_rate": 0.00037636733153664396, "loss": 0.83000934, "num_input_tokens_seen": 256844848, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 3078, "time_per_iteration": 2.8055219650268555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0109815, "balance_loss_mlp": 1.08854795, "diversity_loss_mlp": 0.0, "epoch": 0.5923432089265102, "flos": 563272662528.0, "grad_norm": 0.08595437511710807, "language_loss": 0.80202127, "learning_rate": 0.0003760654873173124, "loss": 0.81300277, "num_input_tokens_seen": 256916688, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3079, "time_per_iteration": 2.6700353622436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089136, "balance_loss_mlp": 1.07927787, "diversity_loss_mlp": 0.0, "epoch": 0.5925355906117737, "flos": 495740113920.0, "grad_norm": 0.06826446524438025, "language_loss": 0.82043588, "learning_rate": 0.00037576369121362566, "loss": 0.8313272, "num_input_tokens_seen": 256985520, "router_z_loss_mlp": 0.09851074, "routerloss_mlp": 0.0, "step": 3080, "time_per_iteration": 2.596071481704712 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01089019, "balance_loss_mlp": 1.07946444, "diversity_loss_mlp": 0.0, "epoch": 0.5927279722970373, "flos": 566249730048.0, "grad_norm": 0.057614109423291045, "language_loss": 0.81680822, "learning_rate": 0.0003754619433427516, "loss": 0.82769841, "num_input_tokens_seen": 257067552, "router_z_loss_mlp": 0.09558105, "routerloss_mlp": 0.0, "step": 3081, "time_per_iteration": 2.9003093242645264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087273, "balance_loss_mlp": 1.07771826, "diversity_loss_mlp": 0.0, "epoch": 0.5929203539823009, "flos": 666970578432.0, "grad_norm": 0.09118109008842482, "language_loss": 0.7796042, "learning_rate": 0.0003751602438218392, "loss": 0.79047692, "num_input_tokens_seen": 257138896, "router_z_loss_mlp": 0.09545898, "routerloss_mlp": 0.0, "step": 3082, "time_per_iteration": 2.7739951610565186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078659, "balance_loss_mlp": 1.06927121, "diversity_loss_mlp": 0.0, "epoch": 0.5931127356675644, "flos": 555744084480.0, "grad_norm": 0.07641398361038237, "language_loss": 0.84107417, "learning_rate": 0.0003748585927680186, "loss": 0.85186076, "num_input_tokens_seen": 257210592, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 3083, "time_per_iteration": 2.6706809997558594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087865, "balance_loss_mlp": 1.07850111, "diversity_loss_mlp": 0.0, "epoch": 0.593305117352828, "flos": 535194210816.0, "grad_norm": 0.07450452823339063, "language_loss": 0.82992828, "learning_rate": 0.00037455699029840086, "loss": 0.84080696, "num_input_tokens_seen": 257276208, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 3084, "time_per_iteration": 2.648775100708008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082396, "balance_loss_mlp": 1.07310402, "diversity_loss_mlp": 0.0, "epoch": 0.5934974990380916, "flos": 593957795328.0, "grad_norm": 0.0678124296562273, "language_loss": 0.84694779, "learning_rate": 0.0003742554365300787, "loss": 0.85777175, "num_input_tokens_seen": 257351920, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 3085, "time_per_iteration": 2.787437677383423 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00854998, "balance_loss_mlp": 1.4632709, "diversity_loss_mlp": 0.21810779, "epoch": 0.5936898807233552, "flos": 712673220096.0, "grad_norm": 0.030613192067315453, "language_loss": 0.79049134, "learning_rate": 0.0003739539315801255, "loss": 0.79904133, "num_input_tokens_seen": 257430016, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01430825, "step": 3086, "time_per_iteration": 2.9476425647735596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088902, "balance_loss_mlp": 1.07956231, "diversity_loss_mlp": 0.0, "epoch": 0.5938822624086187, "flos": 391896465408.0, "grad_norm": 0.08021663243926581, "language_loss": 0.91758776, "learning_rate": 0.000373652475565596, "loss": 0.92847675, "num_input_tokens_seen": 257492224, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 3087, "time_per_iteration": 2.473820924758911 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086482, "balance_loss_mlp": 1.07684994, "diversity_loss_mlp": 0.0, "epoch": 0.5940746440938822, "flos": 480285033984.0, "grad_norm": 0.0746565513598584, "language_loss": 0.81288451, "learning_rate": 0.00037335106860352587, "loss": 0.8237493, "num_input_tokens_seen": 257567824, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3088, "time_per_iteration": 2.6710119247436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085875, "balance_loss_mlp": 1.07624292, "diversity_loss_mlp": 0.0, "epoch": 0.5942670257791458, "flos": 483336626688.0, "grad_norm": 0.06157127364570171, "language_loss": 0.82947195, "learning_rate": 0.00037304971081093146, "loss": 0.84033072, "num_input_tokens_seen": 257635488, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3089, "time_per_iteration": 2.5530550479888916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01095759, "balance_loss_mlp": 1.0863055, "diversity_loss_mlp": 0.0, "epoch": 0.5944594074644094, "flos": 547936151040.0, "grad_norm": 0.06188782031055571, "language_loss": 0.80896157, "learning_rate": 0.00037274840230481024, "loss": 0.81991911, "num_input_tokens_seen": 257709552, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3090, "time_per_iteration": 2.707697868347168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094547, "balance_loss_mlp": 1.08488476, "diversity_loss_mlp": 0.0, "epoch": 0.594651789149673, "flos": 449179955712.0, "grad_norm": 0.07660649649984981, "language_loss": 0.79309815, "learning_rate": 0.00037244714320214077, "loss": 0.80404359, "num_input_tokens_seen": 257775520, "router_z_loss_mlp": 0.09661865, "routerloss_mlp": 0.0, "step": 3091, "time_per_iteration": 2.524418354034424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01094365, "balance_loss_mlp": 1.08449435, "diversity_loss_mlp": 0.0, "epoch": 0.5948441708349365, "flos": 596267868672.0, "grad_norm": 0.07189913531932149, "language_loss": 0.83442843, "learning_rate": 0.000372145933619882, "loss": 0.84537208, "num_input_tokens_seen": 257858560, "router_z_loss_mlp": 0.09863281, "routerloss_mlp": 0.0, "step": 3092, "time_per_iteration": 2.889267683029175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01098289, "balance_loss_mlp": 1.0883646, "diversity_loss_mlp": 0.0, "epoch": 0.5950365525202, "flos": 548516883456.0, "grad_norm": 0.08404319768947686, "language_loss": 0.82928061, "learning_rate": 0.000371844773674974, "loss": 0.84026349, "num_input_tokens_seen": 257928048, "router_z_loss_mlp": 0.09918213, "routerloss_mlp": 0.0, "step": 3093, "time_per_iteration": 2.729433059692383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00849837, "balance_loss_mlp": 1.45755267, "diversity_loss_mlp": 0.21677493, "epoch": 0.5952289342054636, "flos": 654700340736.0, "grad_norm": 0.03215359042810467, "language_loss": 0.82038867, "learning_rate": 0.0003715436634843375, "loss": 0.82888705, "num_input_tokens_seen": 258003088, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01267278, "step": 3094, "time_per_iteration": 2.8759658336639404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01110065, "balance_loss_mlp": 1.10049295, "diversity_loss_mlp": 0.0, "epoch": 0.5954213158907272, "flos": 603364018176.0, "grad_norm": 0.05868361705811182, "language_loss": 0.80998492, "learning_rate": 0.00037124260316487355, "loss": 0.82108557, "num_input_tokens_seen": 258084880, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3095, "time_per_iteration": 2.8515610694885254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01120202, "balance_loss_mlp": 1.11049807, "diversity_loss_mlp": 0.0, "epoch": 0.5956136975759908, "flos": 486331319808.0, "grad_norm": 0.06311708190042467, "language_loss": 0.89435279, "learning_rate": 0.0003709415928334643, "loss": 0.90555483, "num_input_tokens_seen": 258152032, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 3096, "time_per_iteration": 2.5820794105529785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00850727, "balance_loss_mlp": 1.45894229, "diversity_loss_mlp": 0.21772251, "epoch": 0.5958060792612543, "flos": 658777204224.0, "grad_norm": 0.03378868601366531, "language_loss": 0.80653715, "learning_rate": 0.00037064063260697233, "loss": 0.81504446, "num_input_tokens_seen": 258228896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01239414, "step": 3097, "time_per_iteration": 2.897676467895508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01138893, "balance_loss_mlp": 1.12893891, "diversity_loss_mlp": 0.0, "epoch": 0.5959984609465179, "flos": 723559537152.0, "grad_norm": 0.06769209825818075, "language_loss": 0.78597271, "learning_rate": 0.0003703397226022407, "loss": 0.79736161, "num_input_tokens_seen": 258311152, "router_z_loss_mlp": 0.0994873, "routerloss_mlp": 0.0, "step": 3098, "time_per_iteration": 3.039377212524414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056672, "balance_loss_mlp": 1.05123568, "diversity_loss_mlp": 0.0, "epoch": 0.5961908426317815, "flos": 1519849557504.0, "grad_norm": 0.0345928166567928, "language_loss": 0.75499874, "learning_rate": 0.00037003886293609335, "loss": 0.76556545, "num_input_tokens_seen": 258540656, "router_z_loss_mlp": 0.05444336, "routerloss_mlp": 0.0, "step": 3099, "time_per_iteration": 4.977718114852905 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00847219, "balance_loss_mlp": 1.45243645, "diversity_loss_mlp": 0.21764749, "epoch": 0.596383224317045, "flos": 532614693888.0, "grad_norm": 0.029968084230811296, "language_loss": 0.83180296, "learning_rate": 0.0003697380537253339, "loss": 0.84027505, "num_input_tokens_seen": 258608960, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01217673, "step": 3100, "time_per_iteration": 2.673551559448242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01121175, "balance_loss_mlp": 1.11119175, "diversity_loss_mlp": 0.0, "epoch": 0.5965756060023086, "flos": 591210150912.0, "grad_norm": 0.06630352939366652, "language_loss": 0.81596649, "learning_rate": 0.0003694372950867471, "loss": 0.82717824, "num_input_tokens_seen": 258684304, "router_z_loss_mlp": 0.09979248, "routerloss_mlp": 0.0, "step": 3101, "time_per_iteration": 2.7776670455932617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01119741, "balance_loss_mlp": 1.1100198, "diversity_loss_mlp": 0.0, "epoch": 0.5967679876875721, "flos": 862054327296.0, "grad_norm": 0.07189145573728124, "language_loss": 0.77408171, "learning_rate": 0.0003691365871370976, "loss": 0.78527915, "num_input_tokens_seen": 258769472, "router_z_loss_mlp": 0.09710693, "routerloss_mlp": 0.0, "step": 3102, "time_per_iteration": 3.04355525970459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01116521, "balance_loss_mlp": 1.1067102, "diversity_loss_mlp": 0.0, "epoch": 0.5969603693728357, "flos": 553834132992.0, "grad_norm": 0.06839859357083694, "language_loss": 0.8504554, "learning_rate": 0.00036883592999313093, "loss": 0.8616206, "num_input_tokens_seen": 258841696, "router_z_loss_mlp": 0.09802246, "routerloss_mlp": 0.0, "step": 3103, "time_per_iteration": 2.6881608963012695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01111468, "balance_loss_mlp": 1.1020087, "diversity_loss_mlp": 0.0, "epoch": 0.5971527510580993, "flos": 718662606336.0, "grad_norm": 0.07720585150601726, "language_loss": 0.7960434, "learning_rate": 0.0003685353237715722, "loss": 0.80715817, "num_input_tokens_seen": 258915616, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3104, "time_per_iteration": 2.910879135131836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01104035, "balance_loss_mlp": 1.09433126, "diversity_loss_mlp": 0.0, "epoch": 0.5973451327433629, "flos": 647631355392.0, "grad_norm": 0.08349083770410728, "language_loss": 0.81658864, "learning_rate": 0.0003682347685891274, "loss": 0.82762903, "num_input_tokens_seen": 258994080, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 3105, "time_per_iteration": 2.8556530475616455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01093856, "balance_loss_mlp": 1.08412814, "diversity_loss_mlp": 0.0, "epoch": 0.5975375144286263, "flos": 721716397056.0, "grad_norm": 0.07861180875636395, "language_loss": 0.80587226, "learning_rate": 0.0003679342645624822, "loss": 0.81681079, "num_input_tokens_seen": 259075968, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3106, "time_per_iteration": 2.9788949489593506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091288, "balance_loss_mlp": 1.08144689, "diversity_loss_mlp": 0.0, "epoch": 0.5977298961138899, "flos": 750961082880.0, "grad_norm": 0.062123999367099406, "language_loss": 0.81345969, "learning_rate": 0.0003676338118083025, "loss": 0.82437259, "num_input_tokens_seen": 259162512, "router_z_loss_mlp": 0.09832764, "routerloss_mlp": 0.0, "step": 3107, "time_per_iteration": 3.0514276027679443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083265, "balance_loss_mlp": 1.07369304, "diversity_loss_mlp": 0.0, "epoch": 0.5979222777991535, "flos": 530961702912.0, "grad_norm": 0.07200241428310707, "language_loss": 0.79341209, "learning_rate": 0.0003673334104432347, "loss": 0.8042447, "num_input_tokens_seen": 259228752, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3108, "time_per_iteration": 2.6402766704559326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084433, "balance_loss_mlp": 1.07493854, "diversity_loss_mlp": 0.0, "epoch": 0.5981146594844171, "flos": 621749551104.0, "grad_norm": 0.06431634181531254, "language_loss": 0.83437502, "learning_rate": 0.0003670330605839048, "loss": 0.84521937, "num_input_tokens_seen": 259303440, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3109, "time_per_iteration": 2.8350021839141846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071839, "balance_loss_mlp": 1.06252289, "diversity_loss_mlp": 0.0, "epoch": 0.5983070411696807, "flos": 603589045248.0, "grad_norm": 0.08338826074003908, "language_loss": 0.76629049, "learning_rate": 0.0003667327623469191, "loss": 0.77700889, "num_input_tokens_seen": 259378752, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 3110, "time_per_iteration": 2.7434427738189697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086579, "balance_loss_mlp": 1.0770725, "diversity_loss_mlp": 0.0, "epoch": 0.5984994228549442, "flos": 633483472896.0, "grad_norm": 0.07334566089126898, "language_loss": 0.7758621, "learning_rate": 0.00036643251584886333, "loss": 0.78672791, "num_input_tokens_seen": 259454336, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3111, "time_per_iteration": 2.7712619304656982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080276, "balance_loss_mlp": 1.07075715, "diversity_loss_mlp": 0.0, "epoch": 0.5986918045402078, "flos": 525278836224.0, "grad_norm": 0.0661546294312284, "language_loss": 0.81729323, "learning_rate": 0.00036613232120630393, "loss": 0.82809597, "num_input_tokens_seen": 259518960, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3112, "time_per_iteration": 2.6437926292419434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077999, "balance_loss_mlp": 1.06822348, "diversity_loss_mlp": 0.0, "epoch": 0.5988841862254713, "flos": 483180982272.0, "grad_norm": 0.09952194732663294, "language_loss": 0.80305058, "learning_rate": 0.00036583217853578643, "loss": 0.81383061, "num_input_tokens_seen": 259584352, "router_z_loss_mlp": 0.09771729, "routerloss_mlp": 0.0, "step": 3113, "time_per_iteration": 2.5917038917541504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085265, "balance_loss_mlp": 1.07562053, "diversity_loss_mlp": 0.0, "epoch": 0.5990765679107349, "flos": 1140149924352.0, "grad_norm": 0.09394979208953491, "language_loss": 0.77671385, "learning_rate": 0.000365532087953837, "loss": 0.78756654, "num_input_tokens_seen": 259693152, "router_z_loss_mlp": 0.09631348, "routerloss_mlp": 0.0, "step": 3114, "time_per_iteration": 3.6197850704193115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075561, "balance_loss_mlp": 1.06598282, "diversity_loss_mlp": 0.0, "epoch": 0.5992689495959984, "flos": 516986717184.0, "grad_norm": 0.08322265150120763, "language_loss": 0.89675403, "learning_rate": 0.00036523204957696065, "loss": 0.90750962, "num_input_tokens_seen": 259762048, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3115, "time_per_iteration": 2.5928850173950195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068785, "balance_loss_mlp": 1.05900383, "diversity_loss_mlp": 0.0, "epoch": 0.599461331281262, "flos": 744618562560.0, "grad_norm": 0.07018475264035358, "language_loss": 0.80565965, "learning_rate": 0.00036493206352164324, "loss": 0.81634748, "num_input_tokens_seen": 259843184, "router_z_loss_mlp": 0.09771729, "routerloss_mlp": 0.0, "step": 3116, "time_per_iteration": 2.9302330017089844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070118, "balance_loss_mlp": 1.06046212, "diversity_loss_mlp": 0.0, "epoch": 0.5996537129665256, "flos": 592359132672.0, "grad_norm": 0.07338463965566117, "language_loss": 0.85090643, "learning_rate": 0.000364632129904349, "loss": 0.86160767, "num_input_tokens_seen": 259912720, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3117, "time_per_iteration": 2.7801764011383057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072158, "balance_loss_mlp": 1.0622344, "diversity_loss_mlp": 0.0, "epoch": 0.5998460946517892, "flos": 559010419200.0, "grad_norm": 0.06545944211786243, "language_loss": 0.78013116, "learning_rate": 0.00036433224884152283, "loss": 0.79085279, "num_input_tokens_seen": 259985472, "router_z_loss_mlp": 0.0993042, "routerloss_mlp": 0.0, "step": 3118, "time_per_iteration": 2.714756727218628 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107233, "balance_loss_mlp": 1.06249511, "diversity_loss_mlp": 0.0, "epoch": 0.6000384763370528, "flos": 484567100928.0, "grad_norm": 0.08041065589047977, "language_loss": 0.77752131, "learning_rate": 0.00036403242044958875, "loss": 0.78824466, "num_input_tokens_seen": 260050336, "router_z_loss_mlp": 0.09832764, "routerloss_mlp": 0.0, "step": 3119, "time_per_iteration": 2.583292245864868 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078089, "balance_loss_mlp": 1.06846261, "diversity_loss_mlp": 0.0, "epoch": 0.6002308580223162, "flos": 596767108608.0, "grad_norm": 0.07420053325288596, "language_loss": 0.91699272, "learning_rate": 0.0003637326448449507, "loss": 0.92777365, "num_input_tokens_seen": 260120304, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3120, "time_per_iteration": 2.717006206512451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080309, "balance_loss_mlp": 1.07065916, "diversity_loss_mlp": 0.0, "epoch": 0.6004232397075798, "flos": 545146661376.0, "grad_norm": 0.053625374444117885, "language_loss": 0.86324787, "learning_rate": 0.00036343292214399177, "loss": 0.87405097, "num_input_tokens_seen": 260198304, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3121, "time_per_iteration": 2.7628395557403564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092008, "balance_loss_mlp": 1.08205438, "diversity_loss_mlp": 0.0, "epoch": 0.6006156213928434, "flos": 629947694592.0, "grad_norm": 0.08110417303016995, "language_loss": 0.77154052, "learning_rate": 0.00036313325246307456, "loss": 0.78246063, "num_input_tokens_seen": 260277664, "router_z_loss_mlp": 0.0994873, "routerloss_mlp": 0.0, "step": 3122, "time_per_iteration": 2.7920055389404297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01097808, "balance_loss_mlp": 1.08813453, "diversity_loss_mlp": 0.0, "epoch": 0.600808003078107, "flos": 582315277824.0, "grad_norm": 0.07750521229706399, "language_loss": 0.87508434, "learning_rate": 0.0003628336359185411, "loss": 0.88606238, "num_input_tokens_seen": 260350096, "router_z_loss_mlp": 0.09667969, "routerloss_mlp": 0.0, "step": 3123, "time_per_iteration": 2.6752257347106934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01086195, "balance_loss_mlp": 1.07632422, "diversity_loss_mlp": 0.0, "epoch": 0.6010003847633705, "flos": 635274855936.0, "grad_norm": 0.09005007447476754, "language_loss": 0.75524527, "learning_rate": 0.000362534072626713, "loss": 0.7661072, "num_input_tokens_seen": 260421888, "router_z_loss_mlp": 0.09863281, "routerloss_mlp": 0.0, "step": 3124, "time_per_iteration": 2.7923338413238525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077955, "balance_loss_mlp": 1.06818557, "diversity_loss_mlp": 0.0, "epoch": 0.6011927664486341, "flos": 718763922432.0, "grad_norm": 0.07223530633843779, "language_loss": 0.81714958, "learning_rate": 0.00036223456270389093, "loss": 0.82792914, "num_input_tokens_seen": 260499616, "router_z_loss_mlp": 0.09759521, "routerloss_mlp": 0.0, "step": 3125, "time_per_iteration": 3.0091912746429443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075718, "balance_loss_mlp": 1.06540036, "diversity_loss_mlp": 0.0, "epoch": 0.6013851481338977, "flos": 499036184064.0, "grad_norm": 0.06403369467156497, "language_loss": 0.80792087, "learning_rate": 0.00036193510626635517, "loss": 0.81867802, "num_input_tokens_seen": 260572048, "router_z_loss_mlp": 0.10321045, "routerloss_mlp": 0.0, "step": 3126, "time_per_iteration": 2.704378843307495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066992, "balance_loss_mlp": 1.05687714, "diversity_loss_mlp": 0.0, "epoch": 0.6015775298191612, "flos": 749587447296.0, "grad_norm": 0.06193993783441067, "language_loss": 0.81725299, "learning_rate": 0.0003616357034303649, "loss": 0.82792288, "num_input_tokens_seen": 260644720, "router_z_loss_mlp": 0.10113525, "routerloss_mlp": 0.0, "step": 3127, "time_per_iteration": 3.002530813217163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062328, "balance_loss_mlp": 1.05243957, "diversity_loss_mlp": 0.0, "epoch": 0.6017699115044248, "flos": 593063202816.0, "grad_norm": 0.054941683840542065, "language_loss": 0.78751493, "learning_rate": 0.0003613363543121584, "loss": 0.79813826, "num_input_tokens_seen": 260724864, "router_z_loss_mlp": 0.09887695, "routerloss_mlp": 0.0, "step": 3128, "time_per_iteration": 2.8690690994262695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063837, "balance_loss_mlp": 1.05367482, "diversity_loss_mlp": 0.0, "epoch": 0.6019622931896883, "flos": 515111270400.0, "grad_norm": 0.06760978748019858, "language_loss": 0.85022873, "learning_rate": 0.00036103705902795357, "loss": 0.86086708, "num_input_tokens_seen": 260800896, "router_z_loss_mlp": 0.10162354, "routerloss_mlp": 0.0, "step": 3129, "time_per_iteration": 2.7233073711395264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106265, "balance_loss_mlp": 1.0526309, "diversity_loss_mlp": 0.0, "epoch": 0.6021546748749519, "flos": 490469852160.0, "grad_norm": 0.08999540715217709, "language_loss": 0.79606092, "learning_rate": 0.0003607378176939471, "loss": 0.80668741, "num_input_tokens_seen": 260872736, "router_z_loss_mlp": 0.10015869, "routerloss_mlp": 0.0, "step": 3130, "time_per_iteration": 2.6465327739715576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060318, "balance_loss_mlp": 1.0503943, "diversity_loss_mlp": 0.0, "epoch": 0.6023470565602155, "flos": 541032721920.0, "grad_norm": 0.0812918345139536, "language_loss": 0.82358718, "learning_rate": 0.00036043863042631465, "loss": 0.83419037, "num_input_tokens_seen": 260943264, "router_z_loss_mlp": 0.09918213, "routerloss_mlp": 0.0, "step": 3131, "time_per_iteration": 2.645275354385376 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060921, "balance_loss_mlp": 1.05113363, "diversity_loss_mlp": 0.0, "epoch": 0.6025394382454791, "flos": 845020408320.0, "grad_norm": 0.07968064937120022, "language_loss": 0.7648955, "learning_rate": 0.00036013949734121133, "loss": 0.77550471, "num_input_tokens_seen": 261030064, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3132, "time_per_iteration": 3.1564602851867676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00847858, "balance_loss_mlp": 1.44895816, "diversity_loss_mlp": 0.22101411, "epoch": 0.6027318199307425, "flos": 577173496320.0, "grad_norm": 0.03213509913040014, "language_loss": 0.82544625, "learning_rate": 0.00035984041855477043, "loss": 0.83392477, "num_input_tokens_seen": 261106496, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01287225, "step": 3133, "time_per_iteration": 2.7710041999816895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00606016, "balance_loss_mlp": 1.03831875, "diversity_loss_mlp": 0.14934492, "epoch": 0.6029242016160061, "flos": 1470976754688.0, "grad_norm": 0.0016585081527992916, "language_loss": 0.78709894, "learning_rate": 0.00035954139418310495, "loss": 0.79315913, "num_input_tokens_seen": 261343248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01218408, "step": 3134, "time_per_iteration": 5.010243892669678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058814, "balance_loss_mlp": 1.04887819, "diversity_loss_mlp": 0.0, "epoch": 0.6031165833012697, "flos": 480744626688.0, "grad_norm": 0.06935738535706247, "language_loss": 0.79867685, "learning_rate": 0.00035924242434230637, "loss": 0.80926502, "num_input_tokens_seen": 261416704, "router_z_loss_mlp": 0.0993042, "routerloss_mlp": 0.0, "step": 3135, "time_per_iteration": 2.644461154937744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059705, "balance_loss_mlp": 1.04970384, "diversity_loss_mlp": 0.0, "epoch": 0.6033089649865333, "flos": 499468612608.0, "grad_norm": 0.08930778928911463, "language_loss": 0.78960454, "learning_rate": 0.00035894350914844516, "loss": 0.80020154, "num_input_tokens_seen": 261486688, "router_z_loss_mlp": 0.09997559, "routerloss_mlp": 0.0, "step": 3136, "time_per_iteration": 2.6219546794891357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060882, "balance_loss_mlp": 1.05073738, "diversity_loss_mlp": 0.0, "epoch": 0.6035013466717969, "flos": 556613710848.0, "grad_norm": 0.07477991129212373, "language_loss": 0.82716846, "learning_rate": 0.0003586446487175703, "loss": 0.83777732, "num_input_tokens_seen": 261557344, "router_z_loss_mlp": 0.10137939, "routerloss_mlp": 0.0, "step": 3137, "time_per_iteration": 2.7377843856811523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057253, "balance_loss_mlp": 1.04716182, "diversity_loss_mlp": 0.0, "epoch": 0.6036937283570604, "flos": 594827421696.0, "grad_norm": 0.06084036951856249, "language_loss": 0.85439289, "learning_rate": 0.0003583458431657099, "loss": 0.86496538, "num_input_tokens_seen": 261626240, "router_z_loss_mlp": 0.10089111, "routerloss_mlp": 0.0, "step": 3138, "time_per_iteration": 2.773810863494873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056899, "balance_loss_mlp": 1.04697502, "diversity_loss_mlp": 0.0, "epoch": 0.603886110042324, "flos": 540958569984.0, "grad_norm": 0.10358798927054172, "language_loss": 0.82887417, "learning_rate": 0.00035804709260887056, "loss": 0.83944315, "num_input_tokens_seen": 261696368, "router_z_loss_mlp": 0.09924316, "routerloss_mlp": 0.0, "step": 3139, "time_per_iteration": 2.7064261436462402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0084935, "balance_loss_mlp": 1.45506001, "diversity_loss_mlp": 0.21838406, "epoch": 0.6040784917275875, "flos": 518582808576.0, "grad_norm": 0.02792942393132789, "language_loss": 0.89382195, "learning_rate": 0.0003577483971630373, "loss": 0.9023155, "num_input_tokens_seen": 261769104, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01262751, "step": 3140, "time_per_iteration": 2.747962236404419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063532, "balance_loss_mlp": 1.053352, "diversity_loss_mlp": 0.0, "epoch": 0.6042708734128511, "flos": 660751395840.0, "grad_norm": 0.05833739987767841, "language_loss": 0.84937215, "learning_rate": 0.00035744975694417414, "loss": 0.86000752, "num_input_tokens_seen": 261844880, "router_z_loss_mlp": 0.10180664, "routerloss_mlp": 0.0, "step": 3141, "time_per_iteration": 2.886625289916992 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060532, "balance_loss_mlp": 1.05025589, "diversity_loss_mlp": 0.0, "epoch": 0.6044632550981146, "flos": 572330520576.0, "grad_norm": 0.07799366016494108, "language_loss": 0.82322264, "learning_rate": 0.00035715117206822344, "loss": 0.83382797, "num_input_tokens_seen": 261923280, "router_z_loss_mlp": 0.1027832, "routerloss_mlp": 0.0, "step": 3142, "time_per_iteration": 2.8120434284210205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061407, "balance_loss_mlp": 1.05125666, "diversity_loss_mlp": 0.0, "epoch": 0.6046556367833782, "flos": 546681083904.0, "grad_norm": 0.06292121779847899, "language_loss": 0.80965286, "learning_rate": 0.0003568526426511065, "loss": 0.82026696, "num_input_tokens_seen": 261990832, "router_z_loss_mlp": 0.10150146, "routerloss_mlp": 0.0, "step": 3143, "time_per_iteration": 2.600508689880371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00857497, "balance_loss_mlp": 1.4695704, "diversity_loss_mlp": 0.22092447, "epoch": 0.6048480184686418, "flos": 776838117888.0, "grad_norm": 0.033476134745844106, "language_loss": 0.83131814, "learning_rate": 0.000356554168808722, "loss": 0.8398931, "num_input_tokens_seen": 262063760, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0122495, "step": 3144, "time_per_iteration": 3.026810646057129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106893, "balance_loss_mlp": 1.058887, "diversity_loss_mlp": 0.0, "epoch": 0.6050404001539054, "flos": 657144036864.0, "grad_norm": 0.07082652980877534, "language_loss": 0.85014772, "learning_rate": 0.00035625575065694837, "loss": 0.86083698, "num_input_tokens_seen": 262137968, "router_z_loss_mlp": 0.10040283, "routerloss_mlp": 0.0, "step": 3145, "time_per_iteration": 2.840867519378662 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00845224, "balance_loss_mlp": 1.44920301, "diversity_loss_mlp": 0.21683007, "epoch": 0.605232781839169, "flos": 548983816704.0, "grad_norm": 0.03030378734616264, "language_loss": 0.77627134, "learning_rate": 0.0003559573883116415, "loss": 0.78472358, "num_input_tokens_seen": 262211264, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01220777, "step": 3146, "time_per_iteration": 2.7349908351898193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107352, "balance_loss_mlp": 1.06324959, "diversity_loss_mlp": 0.0, "epoch": 0.6054251635244324, "flos": 605402449920.0, "grad_norm": 0.05605665058846549, "language_loss": 0.85758018, "learning_rate": 0.00035565908188863604, "loss": 0.86831534, "num_input_tokens_seen": 262289648, "router_z_loss_mlp": 0.10272217, "routerloss_mlp": 0.0, "step": 3147, "time_per_iteration": 2.8125319480895996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00845087, "balance_loss_mlp": 1.44807422, "diversity_loss_mlp": 0.21802135, "epoch": 0.605617545209696, "flos": 613679887872.0, "grad_norm": 0.03003998541469304, "language_loss": 0.79795343, "learning_rate": 0.00035536083150374464, "loss": 0.80640435, "num_input_tokens_seen": 262362704, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01203923, "step": 3148, "time_per_iteration": 2.8052470684051514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017561, "balance_loss_mlp": 1.01191068, "diversity_loss_mlp": 0.0, "epoch": 0.6058099268949596, "flos": 1498301577216.0, "grad_norm": 0.017174605961616223, "language_loss": 0.74747956, "learning_rate": 0.00035506263727275893, "loss": 0.75765514, "num_input_tokens_seen": 262596864, "router_z_loss_mlp": 0.05639648, "routerloss_mlp": 0.0, "step": 3149, "time_per_iteration": 4.839694023132324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068624, "balance_loss_mlp": 1.05813408, "diversity_loss_mlp": 0.0, "epoch": 0.6060023085802232, "flos": 670476621312.0, "grad_norm": 0.07659984741592324, "language_loss": 0.86092103, "learning_rate": 0.0003547644993114475, "loss": 0.87160718, "num_input_tokens_seen": 262671088, "router_z_loss_mlp": 0.10491943, "routerloss_mlp": 0.0, "step": 3150, "time_per_iteration": 2.847841739654541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072042, "balance_loss_mlp": 1.06145024, "diversity_loss_mlp": 0.0, "epoch": 0.6061946902654868, "flos": 606168562176.0, "grad_norm": 0.11052058943541425, "language_loss": 0.79770887, "learning_rate": 0.00035446641773555806, "loss": 0.80842924, "num_input_tokens_seen": 262743888, "router_z_loss_mlp": 0.10595703, "routerloss_mlp": 0.0, "step": 3151, "time_per_iteration": 2.748117208480835 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068453, "balance_loss_mlp": 1.05804002, "diversity_loss_mlp": 0.0, "epoch": 0.6063870719507503, "flos": 557844185088.0, "grad_norm": 0.06928200582264574, "language_loss": 0.87033039, "learning_rate": 0.000354168392660816, "loss": 0.88101488, "num_input_tokens_seen": 262819616, "router_z_loss_mlp": 0.10412598, "routerloss_mlp": 0.0, "step": 3152, "time_per_iteration": 2.7237491607666016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064757, "balance_loss_mlp": 1.05449951, "diversity_loss_mlp": 0.0, "epoch": 0.6065794536360138, "flos": 557154796032.0, "grad_norm": 0.08776252561897581, "language_loss": 0.83035654, "learning_rate": 0.0003538704242029252, "loss": 0.84100413, "num_input_tokens_seen": 262893984, "router_z_loss_mlp": 0.1026001, "routerloss_mlp": 0.0, "step": 3153, "time_per_iteration": 2.687469959259033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064416, "balance_loss_mlp": 1.05382478, "diversity_loss_mlp": 0.0, "epoch": 0.6067718353212774, "flos": 690144385536.0, "grad_norm": 0.06996316305541914, "language_loss": 0.78274238, "learning_rate": 0.0003535725124775672, "loss": 0.79338652, "num_input_tokens_seen": 262969648, "router_z_loss_mlp": 0.105896, "routerloss_mlp": 0.0, "step": 3154, "time_per_iteration": 2.844794750213623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056628, "balance_loss_mlp": 1.04631591, "diversity_loss_mlp": 0.0, "epoch": 0.606964217006541, "flos": 521804726784.0, "grad_norm": 0.06399916678040601, "language_loss": 0.86628783, "learning_rate": 0.00035327465760040126, "loss": 0.87685412, "num_input_tokens_seen": 263042048, "router_z_loss_mlp": 0.10314941, "routerloss_mlp": 0.0, "step": 3155, "time_per_iteration": 2.7096383571624756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049685, "balance_loss_mlp": 1.03957009, "diversity_loss_mlp": 0.0, "epoch": 0.6071565986918045, "flos": 641555707392.0, "grad_norm": 0.08275092128409181, "language_loss": 0.84610963, "learning_rate": 0.00035297685968706526, "loss": 0.85660648, "num_input_tokens_seen": 263108032, "router_z_loss_mlp": 0.10113525, "routerloss_mlp": 0.0, "step": 3156, "time_per_iteration": 2.770024061203003 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054018, "balance_loss_mlp": 1.04370594, "diversity_loss_mlp": 0.0, "epoch": 0.6073489803770681, "flos": 560581917696.0, "grad_norm": 0.07863496537101755, "language_loss": 0.83056825, "learning_rate": 0.00035267911885317454, "loss": 0.84110844, "num_input_tokens_seen": 263175184, "router_z_loss_mlp": 0.10314941, "routerloss_mlp": 0.0, "step": 3157, "time_per_iteration": 2.671334743499756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050585, "balance_loss_mlp": 1.04051757, "diversity_loss_mlp": 0.0, "epoch": 0.6075413620623317, "flos": 586088193024.0, "grad_norm": 0.06000790250856451, "language_loss": 0.81843442, "learning_rate": 0.0003523814352143222, "loss": 0.82894027, "num_input_tokens_seen": 263252768, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 3158, "time_per_iteration": 2.820080518722534 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053512, "balance_loss_mlp": 1.04349208, "diversity_loss_mlp": 0.0, "epoch": 0.6077337437475953, "flos": 630812551680.0, "grad_norm": 0.0842902191025903, "language_loss": 0.91154212, "learning_rate": 0.00035208380888607937, "loss": 0.92207724, "num_input_tokens_seen": 263328720, "router_z_loss_mlp": 0.10015869, "routerloss_mlp": 0.0, "step": 3159, "time_per_iteration": 2.769655466079712 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0102985, "balance_loss_mlp": 1.02448559, "diversity_loss_mlp": 0.0, "epoch": 0.6079261254328588, "flos": 1468503696384.0, "grad_norm": 0.01971528727847153, "language_loss": 0.79461986, "learning_rate": 0.000351786239983995, "loss": 0.80491835, "num_input_tokens_seen": 263554656, "router_z_loss_mlp": 0.05371094, "routerloss_mlp": 0.0, "step": 3160, "time_per_iteration": 4.852057933807373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01020567, "balance_loss_mlp": 1.015203, "diversity_loss_mlp": 0.0, "epoch": 0.6081185071181223, "flos": 1523024861184.0, "grad_norm": 0.015706814795434412, "language_loss": 0.7569223, "learning_rate": 0.00035148872862359517, "loss": 0.76712799, "num_input_tokens_seen": 263791600, "router_z_loss_mlp": 0.05371094, "routerloss_mlp": 0.0, "step": 3161, "time_per_iteration": 5.034492015838623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105269, "balance_loss_mlp": 1.04277158, "diversity_loss_mlp": 0.0, "epoch": 0.6083108888033859, "flos": 556319674368.0, "grad_norm": 0.07240231538807727, "language_loss": 0.82060492, "learning_rate": 0.00035119127492038446, "loss": 0.83113182, "num_input_tokens_seen": 263869744, "router_z_loss_mlp": 0.09912109, "routerloss_mlp": 0.0, "step": 3162, "time_per_iteration": 2.7958009243011475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058888, "balance_loss_mlp": 1.04918981, "diversity_loss_mlp": 0.0, "epoch": 0.6085032704886495, "flos": 841166000640.0, "grad_norm": 0.08243185287386566, "language_loss": 0.8267377, "learning_rate": 0.00035089387898984436, "loss": 0.83732659, "num_input_tokens_seen": 263946624, "router_z_loss_mlp": 0.09692383, "routerloss_mlp": 0.0, "step": 3163, "time_per_iteration": 3.0141196250915527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106001, "balance_loss_mlp": 1.04982388, "diversity_loss_mlp": 0.0, "epoch": 0.6086956521739131, "flos": 684792631296.0, "grad_norm": 0.07404044041946549, "language_loss": 0.81452298, "learning_rate": 0.0003505965409474343, "loss": 0.82512313, "num_input_tokens_seen": 264022064, "router_z_loss_mlp": 0.10186768, "routerloss_mlp": 0.0, "step": 3164, "time_per_iteration": 2.884279727935791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00822199, "balance_loss_mlp": 1.40056133, "diversity_loss_mlp": 0.21809974, "epoch": 0.6088880338591766, "flos": 535799536128.0, "grad_norm": 0.02989314006565827, "language_loss": 0.86555362, "learning_rate": 0.0003502992609085913, "loss": 0.8737756, "num_input_tokens_seen": 264089520, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01286863, "step": 3165, "time_per_iteration": 2.665219306945801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064479, "balance_loss_mlp": 1.05481732, "diversity_loss_mlp": 0.0, "epoch": 0.6090804155444401, "flos": 731533026816.0, "grad_norm": 0.0721176964117247, "language_loss": 0.82392001, "learning_rate": 0.00035000203898872954, "loss": 0.83456486, "num_input_tokens_seen": 264173056, "router_z_loss_mlp": 0.09649658, "routerloss_mlp": 0.0, "step": 3166, "time_per_iteration": 3.0119569301605225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064027, "balance_loss_mlp": 1.05416799, "diversity_loss_mlp": 0.0, "epoch": 0.6092727972297037, "flos": 699014665728.0, "grad_norm": 0.07129548452914211, "language_loss": 0.84480536, "learning_rate": 0.0003497048753032406, "loss": 0.85544562, "num_input_tokens_seen": 264250912, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3167, "time_per_iteration": 2.854588031768799 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069841, "balance_loss_mlp": 1.05985689, "diversity_loss_mlp": 0.0, "epoch": 0.6094651789149673, "flos": 1051946735616.0, "grad_norm": 0.07231997141892146, "language_loss": 0.80835009, "learning_rate": 0.000349407769967494, "loss": 0.8190484, "num_input_tokens_seen": 264342800, "router_z_loss_mlp": 0.09979248, "routerloss_mlp": 0.0, "step": 3168, "time_per_iteration": 3.3936102390289307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072291, "balance_loss_mlp": 1.06240892, "diversity_loss_mlp": 0.0, "epoch": 0.6096575606002309, "flos": 503085883392.0, "grad_norm": 0.08318926372150726, "language_loss": 0.8467539, "learning_rate": 0.0003491107230968361, "loss": 0.85747683, "num_input_tokens_seen": 264413664, "router_z_loss_mlp": 0.09881592, "routerloss_mlp": 0.0, "step": 3169, "time_per_iteration": 2.618696928024292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070277, "balance_loss_mlp": 1.06021023, "diversity_loss_mlp": 0.0, "epoch": 0.6098499422854944, "flos": 585643281408.0, "grad_norm": 0.06713277413300113, "language_loss": 0.81751496, "learning_rate": 0.00034881373480659085, "loss": 0.82821774, "num_input_tokens_seen": 264494944, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 3170, "time_per_iteration": 2.862299919128418 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063164, "balance_loss_mlp": 1.05321598, "diversity_loss_mlp": 0.0, "epoch": 0.610042323970758, "flos": 469205996544.0, "grad_norm": 0.08200914133790435, "language_loss": 0.77840459, "learning_rate": 0.0003485168052120594, "loss": 0.78903627, "num_input_tokens_seen": 264561664, "router_z_loss_mlp": 0.09942627, "routerloss_mlp": 0.0, "step": 3171, "time_per_iteration": 2.564657688140869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060206, "balance_loss_mlp": 1.05049598, "diversity_loss_mlp": 0.0, "epoch": 0.6102347056560216, "flos": 514177403904.0, "grad_norm": 0.07281146068818606, "language_loss": 0.80045426, "learning_rate": 0.00034821993442851973, "loss": 0.81105626, "num_input_tokens_seen": 264626256, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3172, "time_per_iteration": 2.6049551963806152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058405, "balance_loss_mlp": 1.04840922, "diversity_loss_mlp": 0.0, "epoch": 0.6104270873412851, "flos": 469013276160.0, "grad_norm": 0.08175384117022455, "language_loss": 0.82176208, "learning_rate": 0.00034792312257122735, "loss": 0.83234608, "num_input_tokens_seen": 264692768, "router_z_loss_mlp": 0.09991455, "routerloss_mlp": 0.0, "step": 3173, "time_per_iteration": 2.6007068157196045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00813523, "balance_loss_mlp": 1.38556361, "diversity_loss_mlp": 0.21673629, "epoch": 0.6106194690265486, "flos": 549875837952.0, "grad_norm": 0.0335182000566727, "language_loss": 0.80848879, "learning_rate": 0.00034762636975541506, "loss": 0.81662405, "num_input_tokens_seen": 264764816, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01237353, "step": 3174, "time_per_iteration": 2.6783013343811035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061612, "balance_loss_mlp": 1.05138397, "diversity_loss_mlp": 0.0, "epoch": 0.6108118507118122, "flos": 472857772032.0, "grad_norm": 0.07909505551334972, "language_loss": 0.81032109, "learning_rate": 0.0003473296760962923, "loss": 0.82093716, "num_input_tokens_seen": 264837968, "router_z_loss_mlp": 0.10229492, "routerloss_mlp": 0.0, "step": 3175, "time_per_iteration": 2.7157249450683594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01017221, "balance_loss_mlp": 1.01159382, "diversity_loss_mlp": 0.0, "epoch": 0.6110042323970758, "flos": 1445166904320.0, "grad_norm": 0.020158265394599716, "language_loss": 0.78533739, "learning_rate": 0.00034703304170904617, "loss": 0.79550958, "num_input_tokens_seen": 265058336, "router_z_loss_mlp": 0.05615234, "routerloss_mlp": 0.0, "step": 3176, "time_per_iteration": 4.707489728927612 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059125, "balance_loss_mlp": 1.04915345, "diversity_loss_mlp": 0.0, "epoch": 0.6111966140823394, "flos": 794153590272.0, "grad_norm": 0.08734600695876651, "language_loss": 0.8132062, "learning_rate": 0.00034673646670883976, "loss": 0.82379746, "num_input_tokens_seen": 265135920, "router_z_loss_mlp": 0.09973145, "routerloss_mlp": 0.0, "step": 3177, "time_per_iteration": 2.965688705444336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101108, "balance_loss_mlp": 1.00557232, "diversity_loss_mlp": 0.0, "epoch": 0.611388995767603, "flos": 1557650663424.0, "grad_norm": 0.01801959168057259, "language_loss": 0.75715023, "learning_rate": 0.0003464399512108141, "loss": 0.76726103, "num_input_tokens_seen": 265374464, "router_z_loss_mlp": 0.05517578, "routerloss_mlp": 0.0, "step": 3178, "time_per_iteration": 4.958420991897583 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00819092, "balance_loss_mlp": 1.39532781, "diversity_loss_mlp": 0.21795917, "epoch": 0.6115813774528664, "flos": 712169210880.0, "grad_norm": 0.031831362939539476, "language_loss": 0.81821573, "learning_rate": 0.0003461434953300865, "loss": 0.82640672, "num_input_tokens_seen": 265450112, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01244847, "step": 3179, "time_per_iteration": 2.92270827293396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063068, "balance_loss_mlp": 1.05295873, "diversity_loss_mlp": 0.0, "epoch": 0.61177375913813, "flos": 684308072448.0, "grad_norm": 0.055258394831610054, "language_loss": 0.81141388, "learning_rate": 0.0003458470991817515, "loss": 0.82204449, "num_input_tokens_seen": 265534336, "router_z_loss_mlp": 0.10113525, "routerloss_mlp": 0.0, "step": 3180, "time_per_iteration": 2.9693758487701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060777, "balance_loss_mlp": 1.05068588, "diversity_loss_mlp": 0.0, "epoch": 0.6119661408233936, "flos": 511662127104.0, "grad_norm": 0.06960725666926779, "language_loss": 0.85075366, "learning_rate": 0.0003455507628808802, "loss": 0.86136144, "num_input_tokens_seen": 265604480, "router_z_loss_mlp": 0.10089111, "routerloss_mlp": 0.0, "step": 3181, "time_per_iteration": 2.6036593914031982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071608, "balance_loss_mlp": 1.06117702, "diversity_loss_mlp": 0.0, "epoch": 0.6121585225086572, "flos": 556809002496.0, "grad_norm": 0.09091925049493645, "language_loss": 0.84135175, "learning_rate": 0.00034525448654252076, "loss": 0.85206783, "num_input_tokens_seen": 265670848, "router_z_loss_mlp": 0.10430908, "routerloss_mlp": 0.0, "step": 3182, "time_per_iteration": 2.636809825897217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061719, "balance_loss_mlp": 1.05150867, "diversity_loss_mlp": 0.0, "epoch": 0.6123509041939207, "flos": 561849467904.0, "grad_norm": 0.07252100888517035, "language_loss": 0.82806599, "learning_rate": 0.0003449582702816976, "loss": 0.83868313, "num_input_tokens_seen": 265739584, "router_z_loss_mlp": 0.10211182, "routerloss_mlp": 0.0, "step": 3183, "time_per_iteration": 2.707475423812866 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070149, "balance_loss_mlp": 1.05986118, "diversity_loss_mlp": 0.0, "epoch": 0.6125432858791843, "flos": 558056729088.0, "grad_norm": 0.07323153161974344, "language_loss": 0.82831162, "learning_rate": 0.0003446621142134122, "loss": 0.8390131, "num_input_tokens_seen": 265810368, "router_z_loss_mlp": 0.10290527, "routerloss_mlp": 0.0, "step": 3184, "time_per_iteration": 2.6639719009399414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068209, "balance_loss_mlp": 1.05824375, "diversity_loss_mlp": 0.0, "epoch": 0.6127356675644479, "flos": 415015944192.0, "grad_norm": 0.08088263565451759, "language_loss": 0.84134692, "learning_rate": 0.0003443660184526424, "loss": 0.85202903, "num_input_tokens_seen": 265871616, "router_z_loss_mlp": 0.09960938, "routerloss_mlp": 0.0, "step": 3185, "time_per_iteration": 2.465219736099243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068542, "balance_loss_mlp": 1.05862343, "diversity_loss_mlp": 0.0, "epoch": 0.6129280492497114, "flos": 603843434496.0, "grad_norm": 0.06289917121629264, "language_loss": 0.86502969, "learning_rate": 0.0003440699831143429, "loss": 0.87571514, "num_input_tokens_seen": 265946672, "router_z_loss_mlp": 0.09912109, "routerloss_mlp": 0.0, "step": 3186, "time_per_iteration": 2.7979393005371094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062443, "balance_loss_mlp": 1.05262065, "diversity_loss_mlp": 0.0, "epoch": 0.613120430934975, "flos": 519766295040.0, "grad_norm": 0.07676649362634465, "language_loss": 0.82236582, "learning_rate": 0.0003437740083134449, "loss": 0.83299029, "num_input_tokens_seen": 266020640, "router_z_loss_mlp": 0.09814453, "routerloss_mlp": 0.0, "step": 3187, "time_per_iteration": 2.686150312423706 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066248, "balance_loss_mlp": 1.0564487, "diversity_loss_mlp": 0.0, "epoch": 0.6133128126202385, "flos": 511083965952.0, "grad_norm": 0.08991197971935971, "language_loss": 0.83540225, "learning_rate": 0.00034347809416485574, "loss": 0.84606475, "num_input_tokens_seen": 266085776, "router_z_loss_mlp": 0.09790039, "routerloss_mlp": 0.0, "step": 3188, "time_per_iteration": 2.604308605194092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106686, "balance_loss_mlp": 1.05696571, "diversity_loss_mlp": 0.0, "epoch": 0.6135051943055021, "flos": 607562021376.0, "grad_norm": 0.07330624647380965, "language_loss": 0.81935883, "learning_rate": 0.0003431822407834597, "loss": 0.83002746, "num_input_tokens_seen": 266157104, "router_z_loss_mlp": 0.09887695, "routerloss_mlp": 0.0, "step": 3189, "time_per_iteration": 2.786008596420288 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070082, "balance_loss_mlp": 1.0602051, "diversity_loss_mlp": 0.0, "epoch": 0.6136975759907657, "flos": 1160200931328.0, "grad_norm": 0.07745901872485048, "language_loss": 0.84407461, "learning_rate": 0.00034288644828411706, "loss": 0.85477537, "num_input_tokens_seen": 266244144, "router_z_loss_mlp": 0.09869385, "routerloss_mlp": 0.0, "step": 3190, "time_per_iteration": 3.4646387100219727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078292, "balance_loss_mlp": 1.06861246, "diversity_loss_mlp": 0.0, "epoch": 0.6138899576760293, "flos": 706938596352.0, "grad_norm": 0.07529521339256182, "language_loss": 0.75715351, "learning_rate": 0.0003425907167816649, "loss": 0.76793635, "num_input_tokens_seen": 266319040, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3191, "time_per_iteration": 2.874946117401123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00808796, "balance_loss_mlp": 1.37378812, "diversity_loss_mlp": 0.21839428, "epoch": 0.6140823393612928, "flos": 586443898368.0, "grad_norm": 0.033870623426287425, "language_loss": 0.84848714, "learning_rate": 0.00034229504639091623, "loss": 0.85657513, "num_input_tokens_seen": 266390784, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01270431, "step": 3192, "time_per_iteration": 2.8179514408111572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074782, "balance_loss_mlp": 1.06519175, "diversity_loss_mlp": 0.0, "epoch": 0.6142747210465563, "flos": 804130633728.0, "grad_norm": 0.07980932307836838, "language_loss": 0.79876941, "learning_rate": 0.0003419994372266606, "loss": 0.80951726, "num_input_tokens_seen": 266483216, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3193, "time_per_iteration": 3.121509552001953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070128, "balance_loss_mlp": 1.06069219, "diversity_loss_mlp": 0.0, "epoch": 0.6144671027318199, "flos": 529434620928.0, "grad_norm": 0.05544583647367184, "language_loss": 0.82228541, "learning_rate": 0.00034170388940366335, "loss": 0.83298671, "num_input_tokens_seen": 266557344, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3194, "time_per_iteration": 2.725961685180664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071987, "balance_loss_mlp": 1.0625093, "diversity_loss_mlp": 0.0, "epoch": 0.6146594844170835, "flos": 805425348096.0, "grad_norm": 0.06534437990847952, "language_loss": 0.80109018, "learning_rate": 0.0003414084030366667, "loss": 0.81181002, "num_input_tokens_seen": 266639488, "router_z_loss_mlp": 0.0947876, "routerloss_mlp": 0.0, "step": 3195, "time_per_iteration": 3.127318859100342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073594, "balance_loss_mlp": 1.06399155, "diversity_loss_mlp": 0.0, "epoch": 0.6148518661023471, "flos": 501697193472.0, "grad_norm": 0.07171859971508983, "language_loss": 0.83377409, "learning_rate": 0.0003411129782403883, "loss": 0.84451008, "num_input_tokens_seen": 266711168, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3196, "time_per_iteration": 2.7145206928253174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078425, "balance_loss_mlp": 1.06870365, "diversity_loss_mlp": 0.0, "epoch": 0.6150442477876106, "flos": 510688613376.0, "grad_norm": 0.09666217933122766, "language_loss": 0.85076511, "learning_rate": 0.0003408176151295225, "loss": 0.86154932, "num_input_tokens_seen": 266777632, "router_z_loss_mlp": 0.09710693, "routerloss_mlp": 0.0, "step": 3197, "time_per_iteration": 2.5919525623321533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079298, "balance_loss_mlp": 1.06990433, "diversity_loss_mlp": 0.0, "epoch": 0.6152366294728742, "flos": 527005979136.0, "grad_norm": 0.06581377475358774, "language_loss": 0.77279031, "learning_rate": 0.00034052231381873944, "loss": 0.78358328, "num_input_tokens_seen": 266842880, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 3198, "time_per_iteration": 2.597702741622925 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082219, "balance_loss_mlp": 1.07295024, "diversity_loss_mlp": 0.0, "epoch": 0.6154290111581378, "flos": 473300112384.0, "grad_norm": 0.0683279233493331, "language_loss": 0.85131848, "learning_rate": 0.00034022707442268494, "loss": 0.8621406, "num_input_tokens_seen": 266909504, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3199, "time_per_iteration": 2.562068223953247 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080014, "balance_loss_mlp": 1.07069743, "diversity_loss_mlp": 0.0, "epoch": 0.6156213928434013, "flos": 550819616256.0, "grad_norm": 0.0761762485373057, "language_loss": 0.82035017, "learning_rate": 0.0003399318970559813, "loss": 0.83115035, "num_input_tokens_seen": 266988880, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 3200, "time_per_iteration": 2.789898157119751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080478, "balance_loss_mlp": 1.07100666, "diversity_loss_mlp": 0.0, "epoch": 0.6158137745286649, "flos": 750941259264.0, "grad_norm": 0.08069642466901547, "language_loss": 0.84662288, "learning_rate": 0.00033963678183322656, "loss": 0.85742772, "num_input_tokens_seen": 267074512, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3201, "time_per_iteration": 3.026878595352173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091206, "balance_loss_mlp": 1.08173513, "diversity_loss_mlp": 0.0, "epoch": 0.6160061562139284, "flos": 555815665152.0, "grad_norm": 0.059556899615455, "language_loss": 0.82784677, "learning_rate": 0.0003393417288689945, "loss": 0.83875883, "num_input_tokens_seen": 267147952, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3202, "time_per_iteration": 2.6654982566833496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090691, "balance_loss_mlp": 1.08118427, "diversity_loss_mlp": 0.0, "epoch": 0.616198537899192, "flos": 742177437696.0, "grad_norm": 0.07467788423655687, "language_loss": 0.76113433, "learning_rate": 0.00033904673827783504, "loss": 0.77204126, "num_input_tokens_seen": 267224368, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 3203, "time_per_iteration": 2.92669939994812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010864, "balance_loss_mlp": 1.07689261, "diversity_loss_mlp": 0.0, "epoch": 0.6163909195844556, "flos": 478810082304.0, "grad_norm": 0.06286363142909755, "language_loss": 0.8181622, "learning_rate": 0.00033875181017427357, "loss": 0.82902622, "num_input_tokens_seen": 267292688, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 3204, "time_per_iteration": 2.5680675506591797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090188, "balance_loss_mlp": 1.08068752, "diversity_loss_mlp": 0.0, "epoch": 0.6165833012697192, "flos": 531517469184.0, "grad_norm": 0.07085405603281952, "language_loss": 0.81132901, "learning_rate": 0.00033845694467281133, "loss": 0.82223082, "num_input_tokens_seen": 267371888, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3205, "time_per_iteration": 2.8592958450317383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00806951, "balance_loss_mlp": 1.37197065, "diversity_loss_mlp": 0.21751499, "epoch": 0.6167756829549826, "flos": 807765156864.0, "grad_norm": 0.030824309293312202, "language_loss": 0.83412218, "learning_rate": 0.00033816214188792516, "loss": 0.84219164, "num_input_tokens_seen": 267458784, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01220786, "step": 3206, "time_per_iteration": 3.1863744258880615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087794, "balance_loss_mlp": 1.07844186, "diversity_loss_mlp": 0.0, "epoch": 0.6169680646402462, "flos": 488928089088.0, "grad_norm": 0.07935266980456598, "language_loss": 0.85488075, "learning_rate": 0.00033786740193406784, "loss": 0.86575866, "num_input_tokens_seen": 267528528, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 3207, "time_per_iteration": 2.626253604888916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108628, "balance_loss_mlp": 1.07682097, "diversity_loss_mlp": 0.0, "epoch": 0.6171604463255098, "flos": 618954918912.0, "grad_norm": 0.07540350896316815, "language_loss": 0.81724775, "learning_rate": 0.00033757272492566736, "loss": 0.82811046, "num_input_tokens_seen": 267611152, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3208, "time_per_iteration": 2.8899030685424805 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080715, "balance_loss_mlp": 1.07114851, "diversity_loss_mlp": 0.0, "epoch": 0.6173528280107734, "flos": 528859031040.0, "grad_norm": 0.05796890161537444, "language_loss": 0.87216032, "learning_rate": 0.0003372781109771278, "loss": 0.88296747, "num_input_tokens_seen": 267681520, "router_z_loss_mlp": 0.09558105, "routerloss_mlp": 0.0, "step": 3209, "time_per_iteration": 2.752558708190918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077325, "balance_loss_mlp": 1.06753802, "diversity_loss_mlp": 0.0, "epoch": 0.617545209696037, "flos": 596581728768.0, "grad_norm": 0.06419749590312054, "language_loss": 0.76373756, "learning_rate": 0.0003369835602028281, "loss": 0.7745108, "num_input_tokens_seen": 267758768, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3210, "time_per_iteration": 2.7878270149230957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068114, "balance_loss_mlp": 1.05842817, "diversity_loss_mlp": 0.0, "epoch": 0.6177375913813005, "flos": 475098835968.0, "grad_norm": 0.0669620080474601, "language_loss": 0.79502624, "learning_rate": 0.0003366890727171232, "loss": 0.8057074, "num_input_tokens_seen": 267831056, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3211, "time_per_iteration": 2.7112903594970703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069089, "balance_loss_mlp": 1.05950451, "diversity_loss_mlp": 0.0, "epoch": 0.617929973066564, "flos": 529812721152.0, "grad_norm": 0.08442057123784988, "language_loss": 0.78359348, "learning_rate": 0.00033639464863434313, "loss": 0.79428434, "num_input_tokens_seen": 267898416, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3212, "time_per_iteration": 2.634425163269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01035652, "balance_loss_mlp": 1.03023958, "diversity_loss_mlp": 0.0, "epoch": 0.6181223547518276, "flos": 1420053783552.0, "grad_norm": 0.02134222442632316, "language_loss": 0.78442466, "learning_rate": 0.00033610028806879363, "loss": 0.79478121, "num_input_tokens_seen": 268112864, "router_z_loss_mlp": 0.05419922, "routerloss_mlp": 0.0, "step": 3213, "time_per_iteration": 4.7891459465026855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066789, "balance_loss_mlp": 1.05715084, "diversity_loss_mlp": 0.0, "epoch": 0.6183147364370912, "flos": 740319243264.0, "grad_norm": 0.07602232380536252, "language_loss": 0.79711038, "learning_rate": 0.00033580599113475543, "loss": 0.80777824, "num_input_tokens_seen": 268198368, "router_z_loss_mlp": 0.09637451, "routerloss_mlp": 0.0, "step": 3214, "time_per_iteration": 2.987006187438965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065135, "balance_loss_mlp": 1.0553956, "diversity_loss_mlp": 0.0, "epoch": 0.6185071181223547, "flos": 381649978368.0, "grad_norm": 0.0762428760353498, "language_loss": 0.86394417, "learning_rate": 0.00033551175794648507, "loss": 0.87459552, "num_input_tokens_seen": 268260704, "router_z_loss_mlp": 0.09735107, "routerloss_mlp": 0.0, "step": 3215, "time_per_iteration": 2.4780433177948 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064383, "balance_loss_mlp": 1.05447078, "diversity_loss_mlp": 0.0, "epoch": 0.6186994998076183, "flos": 463347661824.0, "grad_norm": 0.059308624592263506, "language_loss": 0.81911212, "learning_rate": 0.00033521758861821365, "loss": 0.82975602, "num_input_tokens_seen": 268328256, "router_z_loss_mlp": 0.09906006, "routerloss_mlp": 0.0, "step": 3216, "time_per_iteration": 2.5746333599090576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062859, "balance_loss_mlp": 1.05332255, "diversity_loss_mlp": 0.0, "epoch": 0.6188918814928819, "flos": 485273742336.0, "grad_norm": 0.06339313693664829, "language_loss": 0.89093363, "learning_rate": 0.0003349234832641479, "loss": 0.90156221, "num_input_tokens_seen": 268394016, "router_z_loss_mlp": 0.09527588, "routerloss_mlp": 0.0, "step": 3217, "time_per_iteration": 2.561518669128418 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062704, "balance_loss_mlp": 1.05323243, "diversity_loss_mlp": 0.0, "epoch": 0.6190842631781455, "flos": 657307021824.0, "grad_norm": 0.07035473810033784, "language_loss": 0.81230485, "learning_rate": 0.00033462944199846975, "loss": 0.82293189, "num_input_tokens_seen": 268478512, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3218, "time_per_iteration": 3.0372345447540283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065156, "balance_loss_mlp": 1.05549467, "diversity_loss_mlp": 0.0, "epoch": 0.619276644863409, "flos": 403603223040.0, "grad_norm": 0.07112802613336307, "language_loss": 0.86179578, "learning_rate": 0.00033433546493533606, "loss": 0.87244731, "num_input_tokens_seen": 268540304, "router_z_loss_mlp": 0.09655762, "routerloss_mlp": 0.0, "step": 3219, "time_per_iteration": 2.4615468978881836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066941, "balance_loss_mlp": 1.05763078, "diversity_loss_mlp": 0.0, "epoch": 0.6194690265486725, "flos": 583093499904.0, "grad_norm": 0.07983484825062852, "language_loss": 0.84651643, "learning_rate": 0.00033404155218887897, "loss": 0.8571859, "num_input_tokens_seen": 268611136, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 3220, "time_per_iteration": 2.725001335144043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066491, "balance_loss_mlp": 1.05722845, "diversity_loss_mlp": 0.0, "epoch": 0.6196614082339361, "flos": 504246974976.0, "grad_norm": 0.05498489673307501, "language_loss": 0.87258649, "learning_rate": 0.00033374770387320534, "loss": 0.88325131, "num_input_tokens_seen": 268684992, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3221, "time_per_iteration": 2.7884719371795654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066206, "balance_loss_mlp": 1.05684233, "diversity_loss_mlp": 0.0, "epoch": 0.6198537899191997, "flos": 575409277440.0, "grad_norm": 0.06826724081601121, "language_loss": 0.85091376, "learning_rate": 0.00033345392010239737, "loss": 0.86157584, "num_input_tokens_seen": 268758096, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3222, "time_per_iteration": 2.758528232574463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072791, "balance_loss_mlp": 1.06346869, "diversity_loss_mlp": 0.0, "epoch": 0.6200461716044633, "flos": 593157178368.0, "grad_norm": 0.07112470494876487, "language_loss": 0.82199866, "learning_rate": 0.0003331602009905118, "loss": 0.8327266, "num_input_tokens_seen": 268834432, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 3223, "time_per_iteration": 2.7497544288635254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073437, "balance_loss_mlp": 1.06405497, "diversity_loss_mlp": 0.0, "epoch": 0.6202385532897268, "flos": 666093238272.0, "grad_norm": 0.06198906744782324, "language_loss": 0.8420788, "learning_rate": 0.00033286654665158085, "loss": 0.85281318, "num_input_tokens_seen": 268921168, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 3224, "time_per_iteration": 2.938769817352295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00805444, "balance_loss_mlp": 1.36691594, "diversity_loss_mlp": 0.21943557, "epoch": 0.6204309349749904, "flos": 484952541696.0, "grad_norm": 0.03128305924884035, "language_loss": 0.87915754, "learning_rate": 0.0003325729571996109, "loss": 0.88721198, "num_input_tokens_seen": 268991440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01226849, "step": 3225, "time_per_iteration": 2.6774377822875977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080369, "balance_loss_mlp": 1.07079625, "diversity_loss_mlp": 0.0, "epoch": 0.6206233166602539, "flos": 584057101824.0, "grad_norm": 0.15310961758991004, "language_loss": 0.83791566, "learning_rate": 0.000332279432748584, "loss": 0.8487193, "num_input_tokens_seen": 269061024, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3226, "time_per_iteration": 2.723944664001465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078837, "balance_loss_mlp": 1.06965768, "diversity_loss_mlp": 0.0, "epoch": 0.6208156983455175, "flos": 476917383168.0, "grad_norm": 0.06102841985942585, "language_loss": 0.87609762, "learning_rate": 0.00033198597341245576, "loss": 0.886886, "num_input_tokens_seen": 269130560, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 3227, "time_per_iteration": 2.6077282428741455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107844, "balance_loss_mlp": 1.06877792, "diversity_loss_mlp": 0.0, "epoch": 0.6210080800307811, "flos": 789066137088.0, "grad_norm": 0.05859377500804419, "language_loss": 0.81977952, "learning_rate": 0.00033169257930515763, "loss": 0.8305639, "num_input_tokens_seen": 269213280, "router_z_loss_mlp": 0.09655762, "routerloss_mlp": 0.0, "step": 3228, "time_per_iteration": 3.0201709270477295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079582, "balance_loss_mlp": 1.06983042, "diversity_loss_mlp": 0.0, "epoch": 0.6212004617160446, "flos": 607794388992.0, "grad_norm": 0.06260829937623101, "language_loss": 0.81892502, "learning_rate": 0.0003313992505405951, "loss": 0.82972085, "num_input_tokens_seen": 269286384, "router_z_loss_mlp": 0.09741211, "routerloss_mlp": 0.0, "step": 3229, "time_per_iteration": 2.7065281867980957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085739, "balance_loss_mlp": 1.07612467, "diversity_loss_mlp": 0.0, "epoch": 0.6213928434013082, "flos": 586520621568.0, "grad_norm": 0.07524693848551285, "language_loss": 0.81223184, "learning_rate": 0.0003311059872326487, "loss": 0.82308924, "num_input_tokens_seen": 269353296, "router_z_loss_mlp": 0.09606934, "routerloss_mlp": 0.0, "step": 3230, "time_per_iteration": 2.6831164360046387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082096, "balance_loss_mlp": 1.07257652, "diversity_loss_mlp": 0.0, "epoch": 0.6215852250865718, "flos": 536076320256.0, "grad_norm": 0.08041283658351392, "language_loss": 0.792005, "learning_rate": 0.0003308127894951734, "loss": 0.80282593, "num_input_tokens_seen": 269422304, "router_z_loss_mlp": 0.09509277, "routerloss_mlp": 0.0, "step": 3231, "time_per_iteration": 2.6133408546447754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087839, "balance_loss_mlp": 1.07829607, "diversity_loss_mlp": 0.0, "epoch": 0.6217776067718354, "flos": 618169356288.0, "grad_norm": 0.0806270364015219, "language_loss": 0.86446661, "learning_rate": 0.00033051965744199834, "loss": 0.87534499, "num_input_tokens_seen": 269498784, "router_z_loss_mlp": 0.09533691, "routerloss_mlp": 0.0, "step": 3232, "time_per_iteration": 2.7565104961395264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081354, "balance_loss_mlp": 1.07194829, "diversity_loss_mlp": 0.0, "epoch": 0.6219699884570988, "flos": 545875324416.0, "grad_norm": 0.06624380464527684, "language_loss": 0.90293765, "learning_rate": 0.0003302265911869276, "loss": 0.91375124, "num_input_tokens_seen": 269581264, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3233, "time_per_iteration": 2.926671266555786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070794, "balance_loss_mlp": 1.06132245, "diversity_loss_mlp": 0.0, "epoch": 0.6221623701423624, "flos": 481149891072.0, "grad_norm": 0.08213933441923858, "language_loss": 0.84280741, "learning_rate": 0.0003299335908437397, "loss": 0.85351539, "num_input_tokens_seen": 269649408, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3234, "time_per_iteration": 2.5910556316375732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074187, "balance_loss_mlp": 1.06473994, "diversity_loss_mlp": 0.0, "epoch": 0.622354751827626, "flos": 380024151552.0, "grad_norm": 0.08585428313311574, "language_loss": 0.79975766, "learning_rate": 0.0003296406565261873, "loss": 0.81049955, "num_input_tokens_seen": 269711648, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3235, "time_per_iteration": 2.4815149307250977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069588, "balance_loss_mlp": 1.06017601, "diversity_loss_mlp": 0.0, "epoch": 0.6225471335128896, "flos": 667869940224.0, "grad_norm": 0.07182021420774376, "language_loss": 0.84884858, "learning_rate": 0.0003293477883479978, "loss": 0.85954452, "num_input_tokens_seen": 269787376, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3236, "time_per_iteration": 2.821707248687744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069407, "balance_loss_mlp": 1.05992377, "diversity_loss_mlp": 0.0, "epoch": 0.6227395151981532, "flos": 771320807424.0, "grad_norm": 0.08520791019751349, "language_loss": 0.79754794, "learning_rate": 0.0003290549864228727, "loss": 0.80824208, "num_input_tokens_seen": 269863008, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3237, "time_per_iteration": 2.932542324066162 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075658, "balance_loss_mlp": 1.06604934, "diversity_loss_mlp": 0.0, "epoch": 0.6229318968834167, "flos": 484354556928.0, "grad_norm": 0.07053580491728426, "language_loss": 0.86281902, "learning_rate": 0.0003287622508644875, "loss": 0.87357557, "num_input_tokens_seen": 269939552, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3238, "time_per_iteration": 2.742324113845825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00814101, "balance_loss_mlp": 1.38574493, "diversity_loss_mlp": 0.21743111, "epoch": 0.6231242785686802, "flos": 462935056896.0, "grad_norm": 0.03587473659698897, "language_loss": 0.86128193, "learning_rate": 0.0003284695817864923, "loss": 0.86942297, "num_input_tokens_seen": 270002752, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01251296, "step": 3239, "time_per_iteration": 2.5240445137023926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071749, "balance_loss_mlp": 1.06229532, "diversity_loss_mlp": 0.0, "epoch": 0.6233166602539438, "flos": 609089103360.0, "grad_norm": 0.08834225044652763, "language_loss": 0.84207428, "learning_rate": 0.0003281769793025116, "loss": 0.85279179, "num_input_tokens_seen": 270075696, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3240, "time_per_iteration": 2.733356237411499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00812174, "balance_loss_mlp": 1.3801111, "diversity_loss_mlp": 0.21927354, "epoch": 0.6235090419392074, "flos": 439200340992.0, "grad_norm": 0.03793852776762896, "language_loss": 0.8948651, "learning_rate": 0.00032788444352614346, "loss": 0.90298682, "num_input_tokens_seen": 270139872, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01248194, "step": 3241, "time_per_iteration": 2.599942922592163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077839, "balance_loss_mlp": 1.06840372, "diversity_loss_mlp": 0.0, "epoch": 0.6237014236244709, "flos": 504904430592.0, "grad_norm": 0.07096292336409799, "language_loss": 0.80582923, "learning_rate": 0.0003275919745709606, "loss": 0.81660759, "num_input_tokens_seen": 270206752, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3242, "time_per_iteration": 2.5855822563171387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079567, "balance_loss_mlp": 1.07014906, "diversity_loss_mlp": 0.0, "epoch": 0.6238938053097345, "flos": 512917194240.0, "grad_norm": 0.06686828549294242, "language_loss": 0.81972641, "learning_rate": 0.00032729957255050936, "loss": 0.83052206, "num_input_tokens_seen": 270275472, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3243, "time_per_iteration": 2.652064561843872 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079309, "balance_loss_mlp": 1.06973052, "diversity_loss_mlp": 0.0, "epoch": 0.6240861869949981, "flos": 736751531520.0, "grad_norm": 0.0716805986451115, "language_loss": 0.81674051, "learning_rate": 0.0003270072375783102, "loss": 0.8275336, "num_input_tokens_seen": 270348336, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3244, "time_per_iteration": 2.894718647003174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070218, "balance_loss_mlp": 1.06071746, "diversity_loss_mlp": 0.0, "epoch": 0.6242785686802617, "flos": 494712271872.0, "grad_norm": 0.06745739273028781, "language_loss": 0.79402959, "learning_rate": 0.00032671496976785774, "loss": 0.80473179, "num_input_tokens_seen": 270416496, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3245, "time_per_iteration": 2.637991428375244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077124, "balance_loss_mlp": 1.06772995, "diversity_loss_mlp": 0.0, "epoch": 0.6244709503655252, "flos": 745846465536.0, "grad_norm": 0.06297519573167677, "language_loss": 0.7578575, "learning_rate": 0.0003264227692326205, "loss": 0.76862872, "num_input_tokens_seen": 270501680, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3246, "time_per_iteration": 3.0627310276031494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010763, "balance_loss_mlp": 1.06653643, "diversity_loss_mlp": 0.0, "epoch": 0.6246633320507887, "flos": 492602259456.0, "grad_norm": 0.06711643928809063, "language_loss": 0.85974544, "learning_rate": 0.00032613063608604055, "loss": 0.87050849, "num_input_tokens_seen": 270568656, "router_z_loss_mlp": 0.09753418, "routerloss_mlp": 0.0, "step": 3247, "time_per_iteration": 2.6602516174316406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074686, "balance_loss_mlp": 1.0650897, "diversity_loss_mlp": 0.0, "epoch": 0.6248557137360523, "flos": 517391981568.0, "grad_norm": 0.06836828090896512, "language_loss": 0.8368777, "learning_rate": 0.0003258385704415343, "loss": 0.84762454, "num_input_tokens_seen": 270636160, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3248, "time_per_iteration": 2.5850605964660645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068989, "balance_loss_mlp": 1.05929732, "diversity_loss_mlp": 0.0, "epoch": 0.6250480954213159, "flos": 519363601920.0, "grad_norm": 0.0567839390219681, "language_loss": 0.82901073, "learning_rate": 0.0003255465724124915, "loss": 0.83970058, "num_input_tokens_seen": 270708816, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3249, "time_per_iteration": 2.7133941650390625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068793, "balance_loss_mlp": 1.05952442, "diversity_loss_mlp": 0.0, "epoch": 0.6252404771065795, "flos": 516060191232.0, "grad_norm": 0.05839887652934639, "language_loss": 0.82966471, "learning_rate": 0.00032525464211227587, "loss": 0.84035265, "num_input_tokens_seen": 270778016, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3250, "time_per_iteration": 2.611469030380249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071121, "balance_loss_mlp": 1.06180525, "diversity_loss_mlp": 0.0, "epoch": 0.6254328587918431, "flos": 576916535808.0, "grad_norm": 0.07351416510504778, "language_loss": 0.85770059, "learning_rate": 0.0003249627796542249, "loss": 0.8684119, "num_input_tokens_seen": 270847072, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 3251, "time_per_iteration": 2.6665618419647217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066555, "balance_loss_mlp": 1.05709553, "diversity_loss_mlp": 0.0, "epoch": 0.6256252404771065, "flos": 597930771456.0, "grad_norm": 0.06415360650327814, "language_loss": 0.84284747, "learning_rate": 0.00032467098515164943, "loss": 0.853513, "num_input_tokens_seen": 270926320, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3252, "time_per_iteration": 2.8863329887390137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069653, "balance_loss_mlp": 1.06005657, "diversity_loss_mlp": 0.0, "epoch": 0.6258176221623701, "flos": 508299245568.0, "grad_norm": 0.07319159145136593, "language_loss": 0.83726692, "learning_rate": 0.00032437925871783456, "loss": 0.84796345, "num_input_tokens_seen": 270997904, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3253, "time_per_iteration": 2.6411869525909424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107529, "balance_loss_mlp": 1.06570566, "diversity_loss_mlp": 0.0, "epoch": 0.6260100038476337, "flos": 639645755904.0, "grad_norm": 0.06969705547120199, "language_loss": 0.84202456, "learning_rate": 0.00032408760046603803, "loss": 0.85277742, "num_input_tokens_seen": 271074256, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3254, "time_per_iteration": 2.79947829246521 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070892, "balance_loss_mlp": 1.06131983, "diversity_loss_mlp": 0.0, "epoch": 0.6262023855328973, "flos": 841007784960.0, "grad_norm": 0.06622216529123302, "language_loss": 0.77594912, "learning_rate": 0.00032379601050949193, "loss": 0.78665805, "num_input_tokens_seen": 271155152, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3255, "time_per_iteration": 3.089614152908325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073078, "balance_loss_mlp": 1.06385732, "diversity_loss_mlp": 0.0, "epoch": 0.6263947672181608, "flos": 522138410496.0, "grad_norm": 0.06913459813204618, "language_loss": 0.88098216, "learning_rate": 0.0003235044889614013, "loss": 0.8917129, "num_input_tokens_seen": 271224784, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3256, "time_per_iteration": 2.5961923599243164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076462, "balance_loss_mlp": 1.0670923, "diversity_loss_mlp": 0.0, "epoch": 0.6265871489034244, "flos": 607055440896.0, "grad_norm": 0.07985483332339025, "language_loss": 0.83828497, "learning_rate": 0.0003232130359349451, "loss": 0.84904957, "num_input_tokens_seen": 271303584, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3257, "time_per_iteration": 2.8164010047912598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106986, "balance_loss_mlp": 1.06043053, "diversity_loss_mlp": 0.0, "epoch": 0.626779530588688, "flos": 588484901376.0, "grad_norm": 0.06128522405733426, "language_loss": 0.81820428, "learning_rate": 0.0003229216515432751, "loss": 0.82890296, "num_input_tokens_seen": 271379632, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3258, "time_per_iteration": 2.7743678092956543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00804618, "balance_loss_mlp": 1.36253858, "diversity_loss_mlp": 0.22081783, "epoch": 0.6269719122739515, "flos": 438612268032.0, "grad_norm": 0.03450370763198899, "language_loss": 0.80067343, "learning_rate": 0.0003226303358995174, "loss": 0.80871964, "num_input_tokens_seen": 271447808, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01293936, "step": 3259, "time_per_iteration": 2.6309425830841064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065495, "balance_loss_mlp": 1.05593443, "diversity_loss_mlp": 0.0, "epoch": 0.6271642939592151, "flos": 562874738688.0, "grad_norm": 0.05636981182900784, "language_loss": 0.88916153, "learning_rate": 0.00032233908911677, "loss": 0.89981651, "num_input_tokens_seen": 271526768, "router_z_loss_mlp": 0.09552002, "routerloss_mlp": 0.0, "step": 3260, "time_per_iteration": 2.847928524017334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072322, "balance_loss_mlp": 1.06297052, "diversity_loss_mlp": 0.0, "epoch": 0.6273566756444786, "flos": 514560273408.0, "grad_norm": 0.07940970349438319, "language_loss": 0.810615, "learning_rate": 0.0003220479113081053, "loss": 0.8213383, "num_input_tokens_seen": 271597840, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 3261, "time_per_iteration": 2.7070260047912598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070214, "balance_loss_mlp": 1.06123137, "diversity_loss_mlp": 0.0, "epoch": 0.6275490573297422, "flos": 585472955904.0, "grad_norm": 0.06801817573689214, "language_loss": 0.78964686, "learning_rate": 0.00032175680258656836, "loss": 0.80034894, "num_input_tokens_seen": 271668352, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 3262, "time_per_iteration": 2.7481493949890137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067516, "balance_loss_mlp": 1.05819941, "diversity_loss_mlp": 0.0, "epoch": 0.6277414390150058, "flos": 559423024128.0, "grad_norm": 0.06408124041259919, "language_loss": 0.80091017, "learning_rate": 0.00032146576306517794, "loss": 0.81158531, "num_input_tokens_seen": 271743936, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3263, "time_per_iteration": 2.799330949783325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071623, "balance_loss_mlp": 1.06242585, "diversity_loss_mlp": 0.0, "epoch": 0.6279338207002694, "flos": 612706374144.0, "grad_norm": 0.06510106509747231, "language_loss": 0.80605328, "learning_rate": 0.0003211747928569255, "loss": 0.81676954, "num_input_tokens_seen": 271817008, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 3264, "time_per_iteration": 2.71992826461792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071469, "balance_loss_mlp": 1.06197381, "diversity_loss_mlp": 0.0, "epoch": 0.6281262023855329, "flos": 625685451264.0, "grad_norm": 0.06441574996580214, "language_loss": 0.8154881, "learning_rate": 0.0003208838920747754, "loss": 0.82620275, "num_input_tokens_seen": 271896960, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 3265, "time_per_iteration": 2.8526246547698975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073261, "balance_loss_mlp": 1.06409347, "diversity_loss_mlp": 0.0, "epoch": 0.6283185840707964, "flos": 1123600564224.0, "grad_norm": 0.07893812182761015, "language_loss": 0.76554495, "learning_rate": 0.0003205930608316656, "loss": 0.7762776, "num_input_tokens_seen": 271985008, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 3266, "time_per_iteration": 3.4734575748443604 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066498, "balance_loss_mlp": 1.05708683, "diversity_loss_mlp": 0.0, "epoch": 0.62851096575606, "flos": 515239750656.0, "grad_norm": 0.06620674427686414, "language_loss": 0.85159075, "learning_rate": 0.00032030229924050673, "loss": 0.86225569, "num_input_tokens_seen": 272056368, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3267, "time_per_iteration": 2.7024662494659424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072026, "balance_loss_mlp": 1.06285858, "diversity_loss_mlp": 0.0, "epoch": 0.6287033474413236, "flos": 404171472384.0, "grad_norm": 0.06417389888600762, "language_loss": 0.79950488, "learning_rate": 0.00032001160741418247, "loss": 0.81022519, "num_input_tokens_seen": 272123424, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 3268, "time_per_iteration": 2.6112074851989746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066415, "balance_loss_mlp": 1.05720639, "diversity_loss_mlp": 0.0, "epoch": 0.6288957291265872, "flos": 525718605312.0, "grad_norm": 0.08748068388552233, "language_loss": 0.82228744, "learning_rate": 0.0003197209854655494, "loss": 0.83295155, "num_input_tokens_seen": 272193008, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 3269, "time_per_iteration": 2.642714500427246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064334, "balance_loss_mlp": 1.05507767, "diversity_loss_mlp": 0.0, "epoch": 0.6290881108118507, "flos": 603722294784.0, "grad_norm": 0.07987454353472763, "language_loss": 0.74589109, "learning_rate": 0.0003194304335074371, "loss": 0.7565344, "num_input_tokens_seen": 272275328, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 3270, "time_per_iteration": 2.8935019969940186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061724, "balance_loss_mlp": 1.05230033, "diversity_loss_mlp": 0.0, "epoch": 0.6292804924971143, "flos": 437675830272.0, "grad_norm": 0.07476368913364388, "language_loss": 0.8843264, "learning_rate": 0.0003191399516526475, "loss": 0.89494365, "num_input_tokens_seen": 272339328, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 3271, "time_per_iteration": 2.5182955265045166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010675, "balance_loss_mlp": 1.0580647, "diversity_loss_mlp": 0.0, "epoch": 0.6294728741823779, "flos": 606662659584.0, "grad_norm": 0.0671044499872579, "language_loss": 0.79825693, "learning_rate": 0.0003188495400139559, "loss": 0.80893195, "num_input_tokens_seen": 272416336, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3272, "time_per_iteration": 2.834392786026001 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106563, "balance_loss_mlp": 1.05608094, "diversity_loss_mlp": 0.0, "epoch": 0.6296652558676414, "flos": 701529942528.0, "grad_norm": 0.07440991142052084, "language_loss": 0.84596652, "learning_rate": 0.00031855919870411013, "loss": 0.85662282, "num_input_tokens_seen": 272490368, "router_z_loss_mlp": 0.09539795, "routerloss_mlp": 0.0, "step": 3273, "time_per_iteration": 2.8662502765655518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067391, "balance_loss_mlp": 1.05781233, "diversity_loss_mlp": 0.0, "epoch": 0.6298576375529049, "flos": 523909969920.0, "grad_norm": 0.06934000715416044, "language_loss": 0.8508203, "learning_rate": 0.0003182689278358305, "loss": 0.86149418, "num_input_tokens_seen": 272562992, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3274, "time_per_iteration": 2.707679510116577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071338, "balance_loss_mlp": 1.06173623, "diversity_loss_mlp": 0.0, "epoch": 0.6300500192381685, "flos": 475963693056.0, "grad_norm": 0.08830765837123684, "language_loss": 0.79631943, "learning_rate": 0.0003179787275218105, "loss": 0.80703276, "num_input_tokens_seen": 272629456, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3275, "time_per_iteration": 2.6076841354370117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00806629, "balance_loss_mlp": 1.3660543, "diversity_loss_mlp": 0.22307114, "epoch": 0.6302424009234321, "flos": 520880772096.0, "grad_norm": 0.030809011685951734, "language_loss": 0.84306061, "learning_rate": 0.0003176885978747155, "loss": 0.85112691, "num_input_tokens_seen": 272697440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01206683, "step": 3276, "time_per_iteration": 2.6712234020233154 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070055, "balance_loss_mlp": 1.06039953, "diversity_loss_mlp": 0.0, "epoch": 0.6304347826086957, "flos": 694596777984.0, "grad_norm": 0.05912857494905308, "language_loss": 0.82393259, "learning_rate": 0.0003173985390071839, "loss": 0.83463317, "num_input_tokens_seen": 272774080, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3277, "time_per_iteration": 2.8781204223632812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01020459, "balance_loss_mlp": 1.01545238, "diversity_loss_mlp": 0.0, "epoch": 0.6306271642939593, "flos": 1466858045952.0, "grad_norm": 0.014813696367821054, "language_loss": 0.77900457, "learning_rate": 0.00031710855103182675, "loss": 0.78920913, "num_input_tokens_seen": 272998512, "router_z_loss_mlp": 0.05004883, "routerloss_mlp": 0.0, "step": 3278, "time_per_iteration": 4.869734287261963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071906, "balance_loss_mlp": 1.06190431, "diversity_loss_mlp": 0.0, "epoch": 0.6308195459792227, "flos": 601740762624.0, "grad_norm": 0.07813339799532502, "language_loss": 0.80876654, "learning_rate": 0.00031681863406122704, "loss": 0.8194856, "num_input_tokens_seen": 273074672, "router_z_loss_mlp": 0.09997559, "routerloss_mlp": 0.0, "step": 3279, "time_per_iteration": 2.773547410964966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074089, "balance_loss_mlp": 1.06446278, "diversity_loss_mlp": 0.0, "epoch": 0.6310119276644863, "flos": 726858178560.0, "grad_norm": 0.07216916580711319, "language_loss": 0.85329819, "learning_rate": 0.00031652878820794087, "loss": 0.86403906, "num_input_tokens_seen": 273157904, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3280, "time_per_iteration": 2.980884552001953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070557, "balance_loss_mlp": 1.0605855, "diversity_loss_mlp": 0.0, "epoch": 0.6312043093497499, "flos": 519749042688.0, "grad_norm": 0.08329353384521647, "language_loss": 0.85882401, "learning_rate": 0.00031623901358449627, "loss": 0.8695296, "num_input_tokens_seen": 273228160, "router_z_loss_mlp": 0.09967041, "routerloss_mlp": 0.0, "step": 3281, "time_per_iteration": 2.650691509246826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107097, "balance_loss_mlp": 1.06155276, "diversity_loss_mlp": 0.0, "epoch": 0.6313966910350135, "flos": 531191499264.0, "grad_norm": 0.06939094759952598, "language_loss": 0.88689077, "learning_rate": 0.0003159493103033936, "loss": 0.89760047, "num_input_tokens_seen": 273295872, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3282, "time_per_iteration": 2.589892864227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01022479, "balance_loss_mlp": 1.0175674, "diversity_loss_mlp": 0.0, "epoch": 0.631589072720277, "flos": 1379887529472.0, "grad_norm": 0.015595592818812096, "language_loss": 0.79919052, "learning_rate": 0.00031565967847710564, "loss": 0.80941534, "num_input_tokens_seen": 273524320, "router_z_loss_mlp": 0.04907227, "routerloss_mlp": 0.0, "step": 3283, "time_per_iteration": 4.845726728439331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063188, "balance_loss_mlp": 1.05360401, "diversity_loss_mlp": 0.0, "epoch": 0.6317814544055406, "flos": 624677432832.0, "grad_norm": 0.08266858178450832, "language_loss": 0.82553136, "learning_rate": 0.0003153701182180776, "loss": 0.83616328, "num_input_tokens_seen": 273598544, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3284, "time_per_iteration": 2.783351421356201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065881, "balance_loss_mlp": 1.05632019, "diversity_loss_mlp": 0.0, "epoch": 0.6319738360908042, "flos": 498119569920.0, "grad_norm": 0.063758085961612, "language_loss": 0.81699741, "learning_rate": 0.00031508062963872655, "loss": 0.82765627, "num_input_tokens_seen": 273666000, "router_z_loss_mlp": 0.09558105, "routerloss_mlp": 0.0, "step": 3285, "time_per_iteration": 2.5591769218444824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064029, "balance_loss_mlp": 1.05435503, "diversity_loss_mlp": 0.0, "epoch": 0.6321662177760677, "flos": 579760353792.0, "grad_norm": 0.06946286940388995, "language_loss": 0.79716074, "learning_rate": 0.0003147912128514423, "loss": 0.80780101, "num_input_tokens_seen": 273742672, "router_z_loss_mlp": 0.09667969, "routerloss_mlp": 0.0, "step": 3286, "time_per_iteration": 2.7374072074890137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00792206, "balance_loss_mlp": 1.3388809, "diversity_loss_mlp": 0.2218435, "epoch": 0.6323585994613313, "flos": 601486373376.0, "grad_norm": 0.030646294163886513, "language_loss": 0.87300044, "learning_rate": 0.0003145018679685859, "loss": 0.8809225, "num_input_tokens_seen": 273813984, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01184397, "step": 3287, "time_per_iteration": 2.7549750804901123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067783, "balance_loss_mlp": 1.05837727, "diversity_loss_mlp": 0.0, "epoch": 0.6325509811465948, "flos": 528535259136.0, "grad_norm": 0.05105189166461937, "language_loss": 0.87830782, "learning_rate": 0.00031421259510249134, "loss": 0.88898563, "num_input_tokens_seen": 273892848, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 3288, "time_per_iteration": 2.7835381031036377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067714, "balance_loss_mlp": 1.05796242, "diversity_loss_mlp": 0.0, "epoch": 0.6327433628318584, "flos": 574262866944.0, "grad_norm": 0.136960350782239, "language_loss": 0.81129575, "learning_rate": 0.00031392339436546414, "loss": 0.82197285, "num_input_tokens_seen": 273971696, "router_z_loss_mlp": 0.09747314, "routerloss_mlp": 0.0, "step": 3289, "time_per_iteration": 2.8133864402770996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069758, "balance_loss_mlp": 1.05946374, "diversity_loss_mlp": 0.0, "epoch": 0.632935744517122, "flos": 517088033280.0, "grad_norm": 0.0683406709240254, "language_loss": 0.8385359, "learning_rate": 0.00031363426586978205, "loss": 0.84923339, "num_input_tokens_seen": 274048096, "router_z_loss_mlp": 0.10296631, "routerloss_mlp": 0.0, "step": 3290, "time_per_iteration": 2.7862977981567383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070784, "balance_loss_mlp": 1.06093121, "diversity_loss_mlp": 0.0, "epoch": 0.6331281262023856, "flos": 617462714880.0, "grad_norm": 0.06517080869241837, "language_loss": 0.84541273, "learning_rate": 0.0003133452097276947, "loss": 0.85612059, "num_input_tokens_seen": 274122848, "router_z_loss_mlp": 0.09844971, "routerloss_mlp": 0.0, "step": 3291, "time_per_iteration": 2.735102415084839 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063814, "balance_loss_mlp": 1.05341327, "diversity_loss_mlp": 0.0, "epoch": 0.633320507887649, "flos": 592954546176.0, "grad_norm": 0.06655999718782692, "language_loss": 0.8441304, "learning_rate": 0.0003130562260514238, "loss": 0.85476851, "num_input_tokens_seen": 274198320, "router_z_loss_mlp": 0.10400391, "routerloss_mlp": 0.0, "step": 3292, "time_per_iteration": 2.7411108016967773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067846, "balance_loss_mlp": 1.05757022, "diversity_loss_mlp": 0.0, "epoch": 0.6335128895729126, "flos": 582349782528.0, "grad_norm": 0.05657366074496326, "language_loss": 0.81691957, "learning_rate": 0.0003127673149531626, "loss": 0.82759798, "num_input_tokens_seen": 274274944, "router_z_loss_mlp": 0.1027832, "routerloss_mlp": 0.0, "step": 3293, "time_per_iteration": 2.766249418258667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066103, "balance_loss_mlp": 1.05568373, "diversity_loss_mlp": 0.0, "epoch": 0.6337052712581762, "flos": 453036934656.0, "grad_norm": 0.0752121645824798, "language_loss": 0.83436191, "learning_rate": 0.0003124784765450762, "loss": 0.84502298, "num_input_tokens_seen": 274342384, "router_z_loss_mlp": 0.10418701, "routerloss_mlp": 0.0, "step": 3294, "time_per_iteration": 2.5490550994873047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066062, "balance_loss_mlp": 1.05569124, "diversity_loss_mlp": 0.0, "epoch": 0.6338976529434398, "flos": 573407921664.0, "grad_norm": 0.06917813795445459, "language_loss": 0.797925, "learning_rate": 0.0003121897109393017, "loss": 0.80858564, "num_input_tokens_seen": 274417568, "router_z_loss_mlp": 0.10375977, "routerloss_mlp": 0.0, "step": 3295, "time_per_iteration": 2.779365062713623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061765, "balance_loss_mlp": 1.05135238, "diversity_loss_mlp": 0.0, "epoch": 0.6340900346287034, "flos": 508758838272.0, "grad_norm": 0.06234951999103671, "language_loss": 0.89289808, "learning_rate": 0.0003119010182479481, "loss": 0.9035157, "num_input_tokens_seen": 274488960, "router_z_loss_mlp": 0.10418701, "routerloss_mlp": 0.0, "step": 3296, "time_per_iteration": 2.6138393878936768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069092, "balance_loss_mlp": 1.05855989, "diversity_loss_mlp": 0.0, "epoch": 0.6342824163139669, "flos": 479746520064.0, "grad_norm": 0.06350246507064496, "language_loss": 0.82675922, "learning_rate": 0.00031161239858309563, "loss": 0.83745015, "num_input_tokens_seen": 274556880, "router_z_loss_mlp": 0.10540771, "routerloss_mlp": 0.0, "step": 3297, "time_per_iteration": 2.586970329284668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072163, "balance_loss_mlp": 1.06148767, "diversity_loss_mlp": 0.0, "epoch": 0.6344747979992305, "flos": 572031714816.0, "grad_norm": 0.0696399427467901, "language_loss": 0.83455825, "learning_rate": 0.0003113238520567964, "loss": 0.84527981, "num_input_tokens_seen": 274624944, "router_z_loss_mlp": 0.10681152, "routerloss_mlp": 0.0, "step": 3298, "time_per_iteration": 2.6586110591888428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065276, "balance_loss_mlp": 1.05495286, "diversity_loss_mlp": 0.0, "epoch": 0.634667179684494, "flos": 605911601664.0, "grad_norm": 0.07177816314390054, "language_loss": 0.81584775, "learning_rate": 0.00031103537878107403, "loss": 0.82650054, "num_input_tokens_seen": 274695152, "router_z_loss_mlp": 0.10321045, "routerloss_mlp": 0.0, "step": 3299, "time_per_iteration": 2.708526372909546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106763, "balance_loss_mlp": 1.05756879, "diversity_loss_mlp": 0.0, "epoch": 0.6348595613697576, "flos": 646944537600.0, "grad_norm": 0.0821312661024272, "language_loss": 0.7999661, "learning_rate": 0.0003107469788679238, "loss": 0.81064236, "num_input_tokens_seen": 274767840, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 3300, "time_per_iteration": 2.774571180343628 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070552, "balance_loss_mlp": 1.06004977, "diversity_loss_mlp": 0.0, "epoch": 0.6350519430550212, "flos": 639074935296.0, "grad_norm": 0.06269586290013059, "language_loss": 0.86672354, "learning_rate": 0.00031045865242931267, "loss": 0.87742901, "num_input_tokens_seen": 274839312, "router_z_loss_mlp": 0.10498047, "routerloss_mlp": 0.0, "step": 3301, "time_per_iteration": 2.800271987915039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075539, "balance_loss_mlp": 1.06537664, "diversity_loss_mlp": 0.0, "epoch": 0.6352443247402847, "flos": 686437908480.0, "grad_norm": 0.060025608417058285, "language_loss": 0.83086729, "learning_rate": 0.00031017039957717877, "loss": 0.84162271, "num_input_tokens_seen": 274922704, "router_z_loss_mlp": 0.10162354, "routerloss_mlp": 0.0, "step": 3302, "time_per_iteration": 2.99652361869812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083489, "balance_loss_mlp": 1.07342744, "diversity_loss_mlp": 0.0, "epoch": 0.6354367064255483, "flos": 559442847744.0, "grad_norm": 0.0673613891994724, "language_loss": 0.89035141, "learning_rate": 0.0003098822204234318, "loss": 0.90118629, "num_input_tokens_seen": 274992848, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 3303, "time_per_iteration": 2.6769609451293945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076148, "balance_loss_mlp": 1.06632543, "diversity_loss_mlp": 0.0, "epoch": 0.6356290881108119, "flos": 979487520768.0, "grad_norm": 0.0682411238472533, "language_loss": 0.87294948, "learning_rate": 0.00030959411507995273, "loss": 0.88371098, "num_input_tokens_seen": 275071456, "router_z_loss_mlp": 0.09814453, "routerloss_mlp": 0.0, "step": 3304, "time_per_iteration": 3.25303053855896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073289, "balance_loss_mlp": 1.06334674, "diversity_loss_mlp": 0.0, "epoch": 0.6358214697960755, "flos": 528278298624.0, "grad_norm": 0.09293144525754729, "language_loss": 0.80997777, "learning_rate": 0.00030930608365859407, "loss": 0.82071066, "num_input_tokens_seen": 275140512, "router_z_loss_mlp": 0.09942627, "routerloss_mlp": 0.0, "step": 3305, "time_per_iteration": 2.650047540664673 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079087, "balance_loss_mlp": 1.06908488, "diversity_loss_mlp": 0.0, "epoch": 0.6360138514813389, "flos": 516811249152.0, "grad_norm": 0.06298630616486185, "language_loss": 0.87762672, "learning_rate": 0.00030901812627117943, "loss": 0.8884176, "num_input_tokens_seen": 275210896, "router_z_loss_mlp": 0.10003662, "routerloss_mlp": 0.0, "step": 3306, "time_per_iteration": 2.605576276779175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106604, "balance_loss_mlp": 1.05617523, "diversity_loss_mlp": 0.0, "epoch": 0.6362062331666025, "flos": 466525163520.0, "grad_norm": 0.09439685712352788, "language_loss": 0.8446157, "learning_rate": 0.000308730243029504, "loss": 0.85527611, "num_input_tokens_seen": 275279888, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3307, "time_per_iteration": 2.6111857891082764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070403, "balance_loss_mlp": 1.06070554, "diversity_loss_mlp": 0.0, "epoch": 0.6363986148518661, "flos": 549720193536.0, "grad_norm": 0.06852736886674453, "language_loss": 0.7914747, "learning_rate": 0.0003084424340453339, "loss": 0.80217868, "num_input_tokens_seen": 275357056, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 3308, "time_per_iteration": 2.8072149753570557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063715, "balance_loss_mlp": 1.05379033, "diversity_loss_mlp": 0.0, "epoch": 0.6365909965371297, "flos": 583049083392.0, "grad_norm": 0.0739185528440478, "language_loss": 0.82162523, "learning_rate": 0.0003081546994304064, "loss": 0.8322624, "num_input_tokens_seen": 275428240, "router_z_loss_mlp": 0.09918213, "routerloss_mlp": 0.0, "step": 3309, "time_per_iteration": 2.7670769691467285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059397, "balance_loss_mlp": 1.04971123, "diversity_loss_mlp": 0.0, "epoch": 0.6367833782223933, "flos": 531255739392.0, "grad_norm": 0.07802596117693822, "language_loss": 0.81907165, "learning_rate": 0.0003078670392964298, "loss": 0.82966554, "num_input_tokens_seen": 275497568, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3310, "time_per_iteration": 2.6474099159240723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058905, "balance_loss_mlp": 1.04899311, "diversity_loss_mlp": 0.0, "epoch": 0.6369757599076568, "flos": 569506526208.0, "grad_norm": 0.0731557233203608, "language_loss": 0.82997435, "learning_rate": 0.00030757945375508406, "loss": 0.84056342, "num_input_tokens_seen": 275569616, "router_z_loss_mlp": 0.09906006, "routerloss_mlp": 0.0, "step": 3311, "time_per_iteration": 2.6429851055145264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054164, "balance_loss_mlp": 1.04434729, "diversity_loss_mlp": 0.0, "epoch": 0.6371681415929203, "flos": 539957892096.0, "grad_norm": 0.06845871409018763, "language_loss": 0.81414253, "learning_rate": 0.00030729194291801944, "loss": 0.8246842, "num_input_tokens_seen": 275641408, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3312, "time_per_iteration": 2.6631555557250977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105661, "balance_loss_mlp": 1.04690671, "diversity_loss_mlp": 0.0, "epoch": 0.6373605232781839, "flos": 483566423040.0, "grad_norm": 0.08097298950364754, "language_loss": 0.77058214, "learning_rate": 0.00030700450689685787, "loss": 0.78114825, "num_input_tokens_seen": 275706608, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 3313, "time_per_iteration": 2.540600061416626 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059608, "balance_loss_mlp": 1.0500232, "diversity_loss_mlp": 0.0, "epoch": 0.6375529049634475, "flos": 578581636608.0, "grad_norm": 0.0804877394257798, "language_loss": 0.85728467, "learning_rate": 0.00030671714580319186, "loss": 0.86788076, "num_input_tokens_seen": 275785952, "router_z_loss_mlp": 0.0958252, "routerloss_mlp": 0.0, "step": 3314, "time_per_iteration": 2.804875135421753 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055412, "balance_loss_mlp": 1.04565513, "diversity_loss_mlp": 0.0, "epoch": 0.637745286648711, "flos": 682257530880.0, "grad_norm": 0.07597136338877614, "language_loss": 0.83442312, "learning_rate": 0.0003064298597485846, "loss": 0.84497726, "num_input_tokens_seen": 275866240, "router_z_loss_mlp": 0.09747314, "routerloss_mlp": 0.0, "step": 3315, "time_per_iteration": 2.860419273376465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010587, "balance_loss_mlp": 1.04858494, "diversity_loss_mlp": 0.0, "epoch": 0.6379376683339746, "flos": 504637558272.0, "grad_norm": 0.06770078099501715, "language_loss": 0.83771706, "learning_rate": 0.00030614264884457054, "loss": 0.84830409, "num_input_tokens_seen": 275936176, "router_z_loss_mlp": 0.10113525, "routerloss_mlp": 0.0, "step": 3316, "time_per_iteration": 2.6398963928222656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054628, "balance_loss_mlp": 1.04450154, "diversity_loss_mlp": 0.0, "epoch": 0.6381300500192382, "flos": 502020965376.0, "grad_norm": 0.09575765703427323, "language_loss": 0.77156532, "learning_rate": 0.000305855513202655, "loss": 0.78211164, "num_input_tokens_seen": 276004608, "router_z_loss_mlp": 0.10125732, "routerloss_mlp": 0.0, "step": 3317, "time_per_iteration": 2.57024884223938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052471, "balance_loss_mlp": 1.04220688, "diversity_loss_mlp": 0.0, "epoch": 0.6383224317045018, "flos": 400489961472.0, "grad_norm": 0.07693758647747995, "language_loss": 0.77392501, "learning_rate": 0.0003055684529343138, "loss": 0.7844497, "num_input_tokens_seen": 276066688, "router_z_loss_mlp": 0.10266113, "routerloss_mlp": 0.0, "step": 3318, "time_per_iteration": 2.4296517372131348 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058636, "balance_loss_mlp": 1.04889059, "diversity_loss_mlp": 0.0, "epoch": 0.6385148133897653, "flos": 499377208320.0, "grad_norm": 0.08157026730411542, "language_loss": 0.78901523, "learning_rate": 0.00030528146815099374, "loss": 0.79960155, "num_input_tokens_seen": 276140000, "router_z_loss_mlp": 0.09741211, "routerloss_mlp": 0.0, "step": 3319, "time_per_iteration": 2.6178040504455566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105942, "balance_loss_mlp": 1.0495379, "diversity_loss_mlp": 0.0, "epoch": 0.6387071950750288, "flos": 527665632768.0, "grad_norm": 0.05929975411068792, "language_loss": 0.72059178, "learning_rate": 0.00030499455896411203, "loss": 0.73118603, "num_input_tokens_seen": 276209840, "router_z_loss_mlp": 0.09875488, "routerloss_mlp": 0.0, "step": 3320, "time_per_iteration": 2.627962589263916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01026073, "balance_loss_mlp": 1.02049422, "diversity_loss_mlp": 0.0, "epoch": 0.6388995767602924, "flos": 1455979069440.0, "grad_norm": 0.01967957525447477, "language_loss": 0.76300812, "learning_rate": 0.0003047077254850568, "loss": 0.77326888, "num_input_tokens_seen": 276444784, "router_z_loss_mlp": 0.0559082, "routerloss_mlp": 0.0, "step": 3321, "time_per_iteration": 4.926000595092773 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068116, "balance_loss_mlp": 1.05800068, "diversity_loss_mlp": 0.0, "epoch": 0.639091958445556, "flos": 603895191552.0, "grad_norm": 0.06833251339694629, "language_loss": 0.76524007, "learning_rate": 0.0003044209678251865, "loss": 0.77592129, "num_input_tokens_seen": 276522768, "router_z_loss_mlp": 0.10107422, "routerloss_mlp": 0.0, "step": 3322, "time_per_iteration": 2.916396379470825 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066594, "balance_loss_mlp": 1.05691469, "diversity_loss_mlp": 0.0, "epoch": 0.6392843401308196, "flos": 584516694528.0, "grad_norm": 0.05729140281605497, "language_loss": 0.84366953, "learning_rate": 0.0003041342860958306, "loss": 0.85433549, "num_input_tokens_seen": 276597104, "router_z_loss_mlp": 0.09674072, "routerloss_mlp": 0.0, "step": 3323, "time_per_iteration": 2.7770862579345703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071346, "balance_loss_mlp": 1.06162453, "diversity_loss_mlp": 0.0, "epoch": 0.6394767218160831, "flos": 514681413120.0, "grad_norm": 0.08519156923386062, "language_loss": 0.91346496, "learning_rate": 0.00030384768040828857, "loss": 0.92417842, "num_input_tokens_seen": 276670256, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3324, "time_per_iteration": 2.6812171936035156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081336, "balance_loss_mlp": 1.07172787, "diversity_loss_mlp": 0.0, "epoch": 0.6396691035013466, "flos": 541732022784.0, "grad_norm": 0.07651235317530308, "language_loss": 0.85160887, "learning_rate": 0.00030356115087383094, "loss": 0.86242223, "num_input_tokens_seen": 276737680, "router_z_loss_mlp": 0.0960083, "routerloss_mlp": 0.0, "step": 3325, "time_per_iteration": 2.6458263397216797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00811228, "balance_loss_mlp": 1.37989581, "diversity_loss_mlp": 0.21910624, "epoch": 0.6398614851866102, "flos": 525535796736.0, "grad_norm": 0.034032588306098184, "language_loss": 0.8530367, "learning_rate": 0.00030327469760369803, "loss": 0.86114895, "num_input_tokens_seen": 276803808, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01172681, "step": 3326, "time_per_iteration": 2.6054904460906982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075022, "balance_loss_mlp": 1.06528878, "diversity_loss_mlp": 0.0, "epoch": 0.6400538668718738, "flos": 622989937152.0, "grad_norm": 0.06651858881657381, "language_loss": 0.84802389, "learning_rate": 0.0003029883207091009, "loss": 0.85877407, "num_input_tokens_seen": 276874752, "router_z_loss_mlp": 0.097229, "routerloss_mlp": 0.0, "step": 3327, "time_per_iteration": 2.7084085941314697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075174, "balance_loss_mlp": 1.06530905, "diversity_loss_mlp": 0.0, "epoch": 0.6402462485571374, "flos": 503367436800.0, "grad_norm": 0.07064025062286232, "language_loss": 0.78362405, "learning_rate": 0.00030270202030122095, "loss": 0.79437578, "num_input_tokens_seen": 276947200, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3328, "time_per_iteration": 2.668501615524292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076287, "balance_loss_mlp": 1.06659508, "diversity_loss_mlp": 0.0, "epoch": 0.6404386302424009, "flos": 819247260672.0, "grad_norm": 0.07541554155703202, "language_loss": 0.85661519, "learning_rate": 0.00030241579649121, "loss": 0.867378, "num_input_tokens_seen": 277025712, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3329, "time_per_iteration": 2.9972317218780518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107413, "balance_loss_mlp": 1.06488538, "diversity_loss_mlp": 0.0, "epoch": 0.6406310119276645, "flos": 471812677632.0, "grad_norm": 0.06439571325368963, "language_loss": 0.7957617, "learning_rate": 0.00030212964939018994, "loss": 0.806503, "num_input_tokens_seen": 277091264, "router_z_loss_mlp": 0.09234619, "routerloss_mlp": 0.0, "step": 3330, "time_per_iteration": 2.5598840713500977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075671, "balance_loss_mlp": 1.06651545, "diversity_loss_mlp": 0.0, "epoch": 0.6408233936129281, "flos": 425583631872.0, "grad_norm": 0.07958558119065547, "language_loss": 0.85401917, "learning_rate": 0.0003018435791092527, "loss": 0.8647759, "num_input_tokens_seen": 277154608, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3331, "time_per_iteration": 2.4886720180511475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077198, "balance_loss_mlp": 1.06757176, "diversity_loss_mlp": 0.0, "epoch": 0.6410157752981916, "flos": 549784433664.0, "grad_norm": 0.08502928683846613, "language_loss": 0.80926251, "learning_rate": 0.00030155758575946083, "loss": 0.8200345, "num_input_tokens_seen": 277222176, "router_z_loss_mlp": 0.09637451, "routerloss_mlp": 0.0, "step": 3332, "time_per_iteration": 2.661039113998413 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073982, "balance_loss_mlp": 1.06464815, "diversity_loss_mlp": 0.0, "epoch": 0.6412081569834551, "flos": 475899452928.0, "grad_norm": 0.07641451366860309, "language_loss": 0.84045428, "learning_rate": 0.0003012716694518467, "loss": 0.85119408, "num_input_tokens_seen": 277289600, "router_z_loss_mlp": 0.09332275, "routerloss_mlp": 0.0, "step": 3333, "time_per_iteration": 2.579451322555542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074146, "balance_loss_mlp": 1.06456161, "diversity_loss_mlp": 0.0, "epoch": 0.6414005386687187, "flos": 540921494016.0, "grad_norm": 0.06148329614598223, "language_loss": 0.85011578, "learning_rate": 0.000300985830297413, "loss": 0.86085725, "num_input_tokens_seen": 277362784, "router_z_loss_mlp": 0.09576416, "routerloss_mlp": 0.0, "step": 3334, "time_per_iteration": 2.6951658725738525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070237, "balance_loss_mlp": 1.0607183, "diversity_loss_mlp": 0.0, "epoch": 0.6415929203539823, "flos": 1041317379072.0, "grad_norm": 0.07715385519242493, "language_loss": 0.8765533, "learning_rate": 0.00030070006840713205, "loss": 0.88725567, "num_input_tokens_seen": 277449728, "router_z_loss_mlp": 0.09509277, "routerloss_mlp": 0.0, "step": 3335, "time_per_iteration": 3.415095329284668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068996, "balance_loss_mlp": 1.05956614, "diversity_loss_mlp": 0.0, "epoch": 0.6417853020392459, "flos": 648337996800.0, "grad_norm": 0.06540243812784874, "language_loss": 0.73462147, "learning_rate": 0.000300414383891947, "loss": 0.74531144, "num_input_tokens_seen": 277527552, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3336, "time_per_iteration": 2.8207781314849854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070682, "balance_loss_mlp": 1.06142569, "diversity_loss_mlp": 0.0, "epoch": 0.6419776837245095, "flos": 500899147776.0, "grad_norm": 0.062126831222401244, "language_loss": 0.88856506, "learning_rate": 0.00030012877686276973, "loss": 0.89927197, "num_input_tokens_seen": 277603568, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3337, "time_per_iteration": 2.701467752456665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070599, "balance_loss_mlp": 1.06103206, "diversity_loss_mlp": 0.0, "epoch": 0.642170065409773, "flos": 620620392960.0, "grad_norm": 0.06622404014204096, "language_loss": 0.86998606, "learning_rate": 0.0002998432474304832, "loss": 0.88069206, "num_input_tokens_seen": 277679696, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3338, "time_per_iteration": 2.754462242126465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01023208, "balance_loss_mlp": 1.01724732, "diversity_loss_mlp": 0.0, "epoch": 0.6423624470950365, "flos": 1423539629568.0, "grad_norm": 0.025409804512754288, "language_loss": 0.79237342, "learning_rate": 0.0002995577957059395, "loss": 0.80260551, "num_input_tokens_seen": 277913056, "router_z_loss_mlp": 0.05957031, "routerloss_mlp": 0.0, "step": 3339, "time_per_iteration": 4.871408700942993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061344, "balance_loss_mlp": 1.05190849, "diversity_loss_mlp": 0.0, "epoch": 0.6425548287803001, "flos": 562353477120.0, "grad_norm": 0.056182904751461135, "language_loss": 0.88884711, "learning_rate": 0.00029927242179996107, "loss": 0.89946061, "num_input_tokens_seen": 277983168, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3340, "time_per_iteration": 2.6943204402923584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063875, "balance_loss_mlp": 1.05451107, "diversity_loss_mlp": 0.0, "epoch": 0.6427472104655637, "flos": 585443220480.0, "grad_norm": 0.05740093819519034, "language_loss": 0.83547878, "learning_rate": 0.0002989871258233398, "loss": 0.8461175, "num_input_tokens_seen": 278057600, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 3341, "time_per_iteration": 2.759075164794922 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106288, "balance_loss_mlp": 1.05317652, "diversity_loss_mlp": 0.0, "epoch": 0.6429395921508272, "flos": 404282700288.0, "grad_norm": 0.08495529058707293, "language_loss": 0.82866132, "learning_rate": 0.0002987019078868373, "loss": 0.83929014, "num_input_tokens_seen": 278119232, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3342, "time_per_iteration": 2.460184097290039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00806137, "balance_loss_mlp": 1.3687458, "diversity_loss_mlp": 0.21894245, "epoch": 0.6431319738360908, "flos": 548783755776.0, "grad_norm": 0.03059825895364693, "language_loss": 0.81932986, "learning_rate": 0.00029841676810118484, "loss": 0.82739115, "num_input_tokens_seen": 278187456, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01229309, "step": 3343, "time_per_iteration": 2.6885409355163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058763, "balance_loss_mlp": 1.04915428, "diversity_loss_mlp": 0.0, "epoch": 0.6433243555213544, "flos": 793375368192.0, "grad_norm": 0.0604476685897385, "language_loss": 0.87177467, "learning_rate": 0.0002981317065770839, "loss": 0.88236231, "num_input_tokens_seen": 278262176, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3344, "time_per_iteration": 3.03983736038208 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060176, "balance_loss_mlp": 1.05044222, "diversity_loss_mlp": 0.0, "epoch": 0.643516737206618, "flos": 583031831040.0, "grad_norm": 0.07704872008291591, "language_loss": 0.8078779, "learning_rate": 0.00029784672342520493, "loss": 0.81847966, "num_input_tokens_seen": 278328816, "router_z_loss_mlp": 0.097229, "routerloss_mlp": 0.0, "step": 3345, "time_per_iteration": 2.6846296787261963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061675, "balance_loss_mlp": 1.05220366, "diversity_loss_mlp": 0.0, "epoch": 0.6437091188918815, "flos": 518750936064.0, "grad_norm": 0.06975007259690363, "language_loss": 0.8341136, "learning_rate": 0.00029756181875618834, "loss": 0.84473026, "num_input_tokens_seen": 278395824, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3346, "time_per_iteration": 2.5665693283081055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00808422, "balance_loss_mlp": 1.37269104, "diversity_loss_mlp": 0.21939373, "epoch": 0.643901500577145, "flos": 384946048512.0, "grad_norm": 0.035494504018204545, "language_loss": 0.83294541, "learning_rate": 0.0002972769926806439, "loss": 0.84102958, "num_input_tokens_seen": 278457696, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0123796, "step": 3347, "time_per_iteration": 2.504934549331665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0080263, "balance_loss_mlp": 1.36098909, "diversity_loss_mlp": 0.21952364, "epoch": 0.6440938822624086, "flos": 483722067456.0, "grad_norm": 0.0334865497392214, "language_loss": 0.88848293, "learning_rate": 0.0002969922453091508, "loss": 0.89650929, "num_input_tokens_seen": 278526992, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01237371, "step": 3348, "time_per_iteration": 2.588092803955078 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105701, "balance_loss_mlp": 1.04741955, "diversity_loss_mlp": 0.0, "epoch": 0.6442862639476722, "flos": 540469241856.0, "grad_norm": 0.07081599083542611, "language_loss": 0.85229504, "learning_rate": 0.00029670757675225777, "loss": 0.86286509, "num_input_tokens_seen": 278601120, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3349, "time_per_iteration": 2.7467896938323975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056774, "balance_loss_mlp": 1.04726744, "diversity_loss_mlp": 0.0, "epoch": 0.6444786456329358, "flos": 526912003584.0, "grad_norm": 0.08621507866757971, "language_loss": 0.79660463, "learning_rate": 0.0002964229871204831, "loss": 0.80717242, "num_input_tokens_seen": 278668208, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3350, "time_per_iteration": 2.65602707862854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056473, "balance_loss_mlp": 1.04715693, "diversity_loss_mlp": 0.0, "epoch": 0.6446710273181993, "flos": 697892848128.0, "grad_norm": 0.0705050991392221, "language_loss": 0.83769023, "learning_rate": 0.00029613847652431403, "loss": 0.84825498, "num_input_tokens_seen": 278742832, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 3351, "time_per_iteration": 2.8451104164123535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00797485, "balance_loss_mlp": 1.35163832, "diversity_loss_mlp": 0.21852379, "epoch": 0.6448634090034628, "flos": 625023226368.0, "grad_norm": 0.02943697991412704, "language_loss": 0.79510611, "learning_rate": 0.0002958540450742078, "loss": 0.80308104, "num_input_tokens_seen": 278829744, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01240353, "step": 3352, "time_per_iteration": 2.950679063796997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060228, "balance_loss_mlp": 1.05063784, "diversity_loss_mlp": 0.0, "epoch": 0.6450557906887264, "flos": 600950057472.0, "grad_norm": 0.06852868488451136, "language_loss": 0.7732749, "learning_rate": 0.0002955696928805901, "loss": 0.78387713, "num_input_tokens_seen": 278908592, "router_z_loss_mlp": 0.0958252, "routerloss_mlp": 0.0, "step": 3353, "time_per_iteration": 2.8771724700927734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067774, "balance_loss_mlp": 1.0582372, "diversity_loss_mlp": 0.0, "epoch": 0.64524817237399, "flos": 646200820224.0, "grad_norm": 0.10704512558750189, "language_loss": 0.86111909, "learning_rate": 0.0002952854200538563, "loss": 0.87179685, "num_input_tokens_seen": 278986960, "router_z_loss_mlp": 0.09527588, "routerloss_mlp": 0.0, "step": 3354, "time_per_iteration": 2.777782917022705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00798015, "balance_loss_mlp": 1.35377836, "diversity_loss_mlp": 0.21820019, "epoch": 0.6454405540592536, "flos": 473411340288.0, "grad_norm": 0.032699702246912744, "language_loss": 0.82167614, "learning_rate": 0.000295001226704371, "loss": 0.82965624, "num_input_tokens_seen": 279054896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01202584, "step": 3355, "time_per_iteration": 2.5991604328155518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061481, "balance_loss_mlp": 1.05207551, "diversity_loss_mlp": 0.0, "epoch": 0.6456329357445171, "flos": 611841517056.0, "grad_norm": 0.07645377110954561, "language_loss": 0.82891458, "learning_rate": 0.00029471711294246783, "loss": 0.8395294, "num_input_tokens_seen": 279126816, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3356, "time_per_iteration": 2.8146939277648926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064507, "balance_loss_mlp": 1.05512571, "diversity_loss_mlp": 0.0, "epoch": 0.6458253174297807, "flos": 731683901952.0, "grad_norm": 0.07650305014050414, "language_loss": 0.82254899, "learning_rate": 0.0002944330788784494, "loss": 0.83319402, "num_input_tokens_seen": 279197552, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 3357, "time_per_iteration": 2.90537428855896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106479, "balance_loss_mlp": 1.05508041, "diversity_loss_mlp": 0.0, "epoch": 0.6460176991150443, "flos": 570413228544.0, "grad_norm": 0.06168723315149378, "language_loss": 0.84662282, "learning_rate": 0.00029414912462258786, "loss": 0.85727078, "num_input_tokens_seen": 279275440, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3358, "time_per_iteration": 2.8301830291748047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068597, "balance_loss_mlp": 1.05873299, "diversity_loss_mlp": 0.0, "epoch": 0.6462100808003078, "flos": 583160311296.0, "grad_norm": 0.07109215771884392, "language_loss": 0.81651056, "learning_rate": 0.00029386525028512366, "loss": 0.8271966, "num_input_tokens_seen": 279349168, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3359, "time_per_iteration": 2.689298152923584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068602, "balance_loss_mlp": 1.05881464, "diversity_loss_mlp": 0.0, "epoch": 0.6464024624855714, "flos": 483919557120.0, "grad_norm": 0.0690455154627963, "language_loss": 0.86761546, "learning_rate": 0.0002935814559762666, "loss": 0.8783015, "num_input_tokens_seen": 279427600, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3360, "time_per_iteration": 2.820415496826172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072441, "balance_loss_mlp": 1.06286263, "diversity_loss_mlp": 0.0, "epoch": 0.6465948441708349, "flos": 527774289408.0, "grad_norm": 0.06340694058104589, "language_loss": 0.7940557, "learning_rate": 0.0002932977418061957, "loss": 0.80478007, "num_input_tokens_seen": 279496608, "router_z_loss_mlp": 0.09576416, "routerloss_mlp": 0.0, "step": 3361, "time_per_iteration": 2.638246536254883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075754, "balance_loss_mlp": 1.06592488, "diversity_loss_mlp": 0.0, "epoch": 0.6467872258560985, "flos": 669421615104.0, "grad_norm": 0.11078731162526398, "language_loss": 0.80980253, "learning_rate": 0.00029301410788505833, "loss": 0.82056004, "num_input_tokens_seen": 279568448, "router_z_loss_mlp": 0.0982666, "routerloss_mlp": 0.0, "step": 3362, "time_per_iteration": 2.829946279525757 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067795, "balance_loss_mlp": 1.05792451, "diversity_loss_mlp": 0.0, "epoch": 0.6469796075413621, "flos": 432101620224.0, "grad_norm": 0.08350394703111745, "language_loss": 0.80845594, "learning_rate": 0.00029273055432297126, "loss": 0.81913394, "num_input_tokens_seen": 279631952, "router_z_loss_mlp": 0.09875488, "routerloss_mlp": 0.0, "step": 3363, "time_per_iteration": 2.5047130584716797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057084, "balance_loss_mlp": 1.04717803, "diversity_loss_mlp": 0.0, "epoch": 0.6471719892266257, "flos": 803750335488.0, "grad_norm": 0.06756647759690963, "language_loss": 0.80998582, "learning_rate": 0.00029244708123001917, "loss": 0.8205567, "num_input_tokens_seen": 279706880, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3364, "time_per_iteration": 3.071207284927368 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059298, "balance_loss_mlp": 1.04951715, "diversity_loss_mlp": 0.0, "epoch": 0.6473643709118891, "flos": 577208001024.0, "grad_norm": 0.08982319043529345, "language_loss": 0.84555328, "learning_rate": 0.0002921636887162565, "loss": 0.85614622, "num_input_tokens_seen": 279778864, "router_z_loss_mlp": 0.09771729, "routerloss_mlp": 0.0, "step": 3365, "time_per_iteration": 2.768284797668457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057421, "balance_loss_mlp": 1.04800391, "diversity_loss_mlp": 0.0, "epoch": 0.6475567525971527, "flos": 761420113920.0, "grad_norm": 0.08629567448100454, "language_loss": 0.83712798, "learning_rate": 0.00029188037689170595, "loss": 0.84770226, "num_input_tokens_seen": 279853328, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 3366, "time_per_iteration": 2.9462075233459473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054242, "balance_loss_mlp": 1.04440713, "diversity_loss_mlp": 0.0, "epoch": 0.6477491342824163, "flos": 843103116288.0, "grad_norm": 0.07194825267456643, "language_loss": 0.84329098, "learning_rate": 0.0002915971458663586, "loss": 0.85383338, "num_input_tokens_seen": 279928464, "router_z_loss_mlp": 0.09820557, "routerloss_mlp": 0.0, "step": 3367, "time_per_iteration": 3.052452802658081 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105506, "balance_loss_mlp": 1.04521894, "diversity_loss_mlp": 0.0, "epoch": 0.6479415159676799, "flos": 884820298752.0, "grad_norm": 0.06187590041276245, "language_loss": 0.81901962, "learning_rate": 0.00029131399575017494, "loss": 0.82957023, "num_input_tokens_seen": 280015680, "router_z_loss_mlp": 0.09838867, "routerloss_mlp": 0.0, "step": 3368, "time_per_iteration": 3.260995864868164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054713, "balance_loss_mlp": 1.04508734, "diversity_loss_mlp": 0.0, "epoch": 0.6481338976529435, "flos": 615513116160.0, "grad_norm": 0.08987768190651603, "language_loss": 0.85898274, "learning_rate": 0.0002910309266530836, "loss": 0.8695299, "num_input_tokens_seen": 280093904, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3369, "time_per_iteration": 2.8022115230560303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059559, "balance_loss_mlp": 1.0497539, "diversity_loss_mlp": 0.0, "epoch": 0.648326279338207, "flos": 510009136128.0, "grad_norm": 0.07644364345836648, "language_loss": 0.8560974, "learning_rate": 0.0002907479386849814, "loss": 0.86669296, "num_input_tokens_seen": 280161584, "router_z_loss_mlp": 0.09796143, "routerloss_mlp": 0.0, "step": 3370, "time_per_iteration": 2.646334171295166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057441, "balance_loss_mlp": 1.04791021, "diversity_loss_mlp": 0.0, "epoch": 0.6485186610234706, "flos": 702498313728.0, "grad_norm": 0.07833648604751785, "language_loss": 0.80597669, "learning_rate": 0.0002904650319557339, "loss": 0.81655109, "num_input_tokens_seen": 280248016, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3371, "time_per_iteration": 2.9977073669433594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787303, "balance_loss_mlp": 1.33170056, "diversity_loss_mlp": 0.21746175, "epoch": 0.6487110427087341, "flos": 560683233792.0, "grad_norm": 0.036264020076934224, "language_loss": 0.81342006, "learning_rate": 0.0002901822065751758, "loss": 0.82129312, "num_input_tokens_seen": 280319024, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01272238, "step": 3372, "time_per_iteration": 2.697375774383545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054945, "balance_loss_mlp": 1.04537833, "diversity_loss_mlp": 0.0, "epoch": 0.6489034243939977, "flos": 680100530688.0, "grad_norm": 0.06787352107623057, "language_loss": 0.8556366, "learning_rate": 0.0002898994626531093, "loss": 0.86618596, "num_input_tokens_seen": 280393200, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3373, "time_per_iteration": 2.8561713695526123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059452, "balance_loss_mlp": 1.05008769, "diversity_loss_mlp": 0.0, "epoch": 0.6490958060792612, "flos": 474412018176.0, "grad_norm": 0.07079984620053167, "language_loss": 0.87879932, "learning_rate": 0.00028961680029930526, "loss": 0.88939387, "num_input_tokens_seen": 280456944, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3374, "time_per_iteration": 2.535357713699341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058352, "balance_loss_mlp": 1.04902411, "diversity_loss_mlp": 0.0, "epoch": 0.6492881877645248, "flos": 588850518528.0, "grad_norm": 0.07847742657670442, "language_loss": 0.7705428, "learning_rate": 0.00028933421962350317, "loss": 0.78112632, "num_input_tokens_seen": 280534352, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 3375, "time_per_iteration": 2.7630350589752197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059653, "balance_loss_mlp": 1.05022955, "diversity_loss_mlp": 0.0, "epoch": 0.6494805694497884, "flos": 642427905024.0, "grad_norm": 0.060066877370730534, "language_loss": 0.83867884, "learning_rate": 0.0002890517207354104, "loss": 0.84927535, "num_input_tokens_seen": 280608912, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3376, "time_per_iteration": 2.8403854370117188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067954, "balance_loss_mlp": 1.05819058, "diversity_loss_mlp": 0.0, "epoch": 0.649672951135052, "flos": 531806736384.0, "grad_norm": 0.07875615832785021, "language_loss": 0.81685328, "learning_rate": 0.0002887693037447029, "loss": 0.82753289, "num_input_tokens_seen": 280678848, "router_z_loss_mlp": 0.09753418, "routerloss_mlp": 0.0, "step": 3377, "time_per_iteration": 2.5936834812164307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00786778, "balance_loss_mlp": 1.32879448, "diversity_loss_mlp": 0.22056285, "epoch": 0.6498653328203156, "flos": 547387725312.0, "grad_norm": 0.03360133181749734, "language_loss": 0.82620949, "learning_rate": 0.00028848696876102443, "loss": 0.8340773, "num_input_tokens_seen": 280750224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01209909, "step": 3378, "time_per_iteration": 2.646881341934204 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01083646, "balance_loss_mlp": 1.07432425, "diversity_loss_mlp": 0.0, "epoch": 0.650057714505579, "flos": 462228415488.0, "grad_norm": 0.07289026043627096, "language_loss": 0.83464664, "learning_rate": 0.00028820471589398723, "loss": 0.84548312, "num_input_tokens_seen": 280817488, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 3379, "time_per_iteration": 2.5300872325897217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0079061, "balance_loss_mlp": 1.3374207, "diversity_loss_mlp": 0.22020277, "epoch": 0.6502500961908426, "flos": 510172121088.0, "grad_norm": 0.03832598047329158, "language_loss": 0.78047603, "learning_rate": 0.00028792254525317196, "loss": 0.78838205, "num_input_tokens_seen": 280887440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01179803, "step": 3380, "time_per_iteration": 2.696711301803589 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01090042, "balance_loss_mlp": 1.08066666, "diversity_loss_mlp": 0.0, "epoch": 0.6504424778761062, "flos": 579827165184.0, "grad_norm": 0.07654044550208572, "language_loss": 0.81385279, "learning_rate": 0.00028764045694812645, "loss": 0.82475317, "num_input_tokens_seen": 280959072, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 3381, "time_per_iteration": 2.7730586528778076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092437, "balance_loss_mlp": 1.08303761, "diversity_loss_mlp": 0.0, "epoch": 0.6506348595613698, "flos": 519457577472.0, "grad_norm": 0.08987457099582341, "language_loss": 0.76744068, "learning_rate": 0.0002873584510883671, "loss": 0.77836508, "num_input_tokens_seen": 281025376, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 3382, "time_per_iteration": 2.6443450450897217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01088701, "balance_loss_mlp": 1.07926512, "diversity_loss_mlp": 0.0, "epoch": 0.6508272412466333, "flos": 510310513152.0, "grad_norm": 0.07067062397279458, "language_loss": 0.86143303, "learning_rate": 0.0002870765277833788, "loss": 0.87232006, "num_input_tokens_seen": 281097616, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3383, "time_per_iteration": 2.740920305252075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108089, "balance_loss_mlp": 1.07161593, "diversity_loss_mlp": 0.0, "epoch": 0.6510196229318969, "flos": 625623782400.0, "grad_norm": 0.07689735458190097, "language_loss": 0.80460048, "learning_rate": 0.00028679468714261347, "loss": 0.81540942, "num_input_tokens_seen": 281170192, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3384, "time_per_iteration": 2.7767040729522705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074737, "balance_loss_mlp": 1.06546891, "diversity_loss_mlp": 0.0, "epoch": 0.6512120046171604, "flos": 474696142848.0, "grad_norm": 0.06416640561224615, "language_loss": 0.76925558, "learning_rate": 0.0002865129292754918, "loss": 0.78000295, "num_input_tokens_seen": 281238832, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 3385, "time_per_iteration": 2.591616630554199 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075196, "balance_loss_mlp": 1.06574309, "diversity_loss_mlp": 0.0, "epoch": 0.651404386302424, "flos": 551854798848.0, "grad_norm": 0.06819374320087251, "language_loss": 0.81950033, "learning_rate": 0.00028623125429140105, "loss": 0.83025235, "num_input_tokens_seen": 281319472, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3386, "time_per_iteration": 2.819565773010254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068167, "balance_loss_mlp": 1.05845094, "diversity_loss_mlp": 0.0, "epoch": 0.6515967679876876, "flos": 523311985152.0, "grad_norm": 0.07152430707450508, "language_loss": 0.8685019, "learning_rate": 0.00028594966229969785, "loss": 0.87918359, "num_input_tokens_seen": 281391168, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3387, "time_per_iteration": 2.6802561283111572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067324, "balance_loss_mlp": 1.05746567, "diversity_loss_mlp": 0.0, "epoch": 0.6517891496729511, "flos": 573874854912.0, "grad_norm": 0.0719578704836234, "language_loss": 0.81695348, "learning_rate": 0.00028566815340970577, "loss": 0.82762671, "num_input_tokens_seen": 281465664, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3388, "time_per_iteration": 2.725184917449951 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078869, "balance_loss_mlp": 1.33117235, "diversity_loss_mlp": 0.22285563, "epoch": 0.6519815313582147, "flos": 555926893056.0, "grad_norm": 0.03133119374313574, "language_loss": 0.80959165, "learning_rate": 0.0002853867277307162, "loss": 0.81747854, "num_input_tokens_seen": 281532928, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01167633, "step": 3389, "time_per_iteration": 2.6700825691223145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066001, "balance_loss_mlp": 1.05601168, "diversity_loss_mlp": 0.0, "epoch": 0.6521739130434783, "flos": 480487666176.0, "grad_norm": 0.077177119922592, "language_loss": 0.82811326, "learning_rate": 0.00028510538537198824, "loss": 0.83877325, "num_input_tokens_seen": 281601680, "router_z_loss_mlp": 0.09985352, "routerloss_mlp": 0.0, "step": 3390, "time_per_iteration": 2.65598464012146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065546, "balance_loss_mlp": 1.05591428, "diversity_loss_mlp": 0.0, "epoch": 0.6523662947287419, "flos": 665707797504.0, "grad_norm": 0.06292665593790116, "language_loss": 0.86663938, "learning_rate": 0.00028482412644274867, "loss": 0.87729478, "num_input_tokens_seen": 281679488, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3391, "time_per_iteration": 2.926029682159424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106361, "balance_loss_mlp": 1.05354261, "diversity_loss_mlp": 0.0, "epoch": 0.6525586764140053, "flos": 548655275520.0, "grad_norm": 0.07441000419261597, "language_loss": 0.74793214, "learning_rate": 0.00028454295105219207, "loss": 0.75856817, "num_input_tokens_seen": 281751056, "router_z_loss_mlp": 0.10064697, "routerloss_mlp": 0.0, "step": 3392, "time_per_iteration": 2.6511483192443848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064701, "balance_loss_mlp": 1.05479479, "diversity_loss_mlp": 0.0, "epoch": 0.6527510580992689, "flos": 802900159488.0, "grad_norm": 0.053639196798002685, "language_loss": 0.79547405, "learning_rate": 0.0002842618593094802, "loss": 0.80612105, "num_input_tokens_seen": 281841008, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3393, "time_per_iteration": 3.1180903911590576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066549, "balance_loss_mlp": 1.05651164, "diversity_loss_mlp": 0.0, "epoch": 0.6529434397845325, "flos": 671166010368.0, "grad_norm": 0.09762000223606793, "language_loss": 0.80486917, "learning_rate": 0.00028398085132374243, "loss": 0.81553459, "num_input_tokens_seen": 281908016, "router_z_loss_mlp": 0.1003418, "routerloss_mlp": 0.0, "step": 3394, "time_per_iteration": 2.805560350418091 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061804, "balance_loss_mlp": 1.05185044, "diversity_loss_mlp": 0.0, "epoch": 0.6531358214697961, "flos": 828409006080.0, "grad_norm": 0.06212778963151281, "language_loss": 0.84015262, "learning_rate": 0.0002836999272040761, "loss": 0.85077065, "num_input_tokens_seen": 281989072, "router_z_loss_mlp": 0.0994873, "routerloss_mlp": 0.0, "step": 3395, "time_per_iteration": 3.1151998043060303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062758, "balance_loss_mlp": 1.05245829, "diversity_loss_mlp": 0.0, "epoch": 0.6533282031550597, "flos": 487403578368.0, "grad_norm": 0.07524661860640132, "language_loss": 0.83834863, "learning_rate": 0.00028341908705954575, "loss": 0.84897625, "num_input_tokens_seen": 282053152, "router_z_loss_mlp": 0.10296631, "routerloss_mlp": 0.0, "step": 3396, "time_per_iteration": 2.5500996112823486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00599946, "balance_loss_mlp": 1.02570343, "diversity_loss_mlp": 0.15256089, "epoch": 0.6535205848403232, "flos": 1557744638976.0, "grad_norm": 0.0014313680900061394, "language_loss": 0.81761813, "learning_rate": 0.00028313833099918265, "loss": 0.82361758, "num_input_tokens_seen": 282283984, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01081435, "step": 3397, "time_per_iteration": 4.838392496109009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060865, "balance_loss_mlp": 1.05047619, "diversity_loss_mlp": 0.0, "epoch": 0.6537129665255867, "flos": 493711593984.0, "grad_norm": 0.08700190278237876, "language_loss": 0.77911532, "learning_rate": 0.00028285765913198604, "loss": 0.78972399, "num_input_tokens_seen": 282353008, "router_z_loss_mlp": 0.10394287, "routerloss_mlp": 0.0, "step": 3398, "time_per_iteration": 2.5510177612304688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056044, "balance_loss_mlp": 1.04590559, "diversity_loss_mlp": 0.0, "epoch": 0.6539053482108503, "flos": 605002328064.0, "grad_norm": 0.06794032810044964, "language_loss": 0.82229477, "learning_rate": 0.0002825770715669227, "loss": 0.83285522, "num_input_tokens_seen": 282427648, "router_z_loss_mlp": 0.10137939, "routerloss_mlp": 0.0, "step": 3399, "time_per_iteration": 2.7065982818603516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052667, "balance_loss_mlp": 1.04248071, "diversity_loss_mlp": 0.0, "epoch": 0.6540977298961139, "flos": 577778821632.0, "grad_norm": 0.06703848890261048, "language_loss": 0.81440985, "learning_rate": 0.00028229656841292634, "loss": 0.82493651, "num_input_tokens_seen": 282502128, "router_z_loss_mlp": 0.10186768, "routerloss_mlp": 0.0, "step": 3400, "time_per_iteration": 2.7117483615875244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050894, "balance_loss_mlp": 1.04067171, "diversity_loss_mlp": 0.0, "epoch": 0.6542901115813774, "flos": 511753531392.0, "grad_norm": 0.06998039744710104, "language_loss": 0.76892245, "learning_rate": 0.0002820161497788979, "loss": 0.7794314, "num_input_tokens_seen": 282569360, "router_z_loss_mlp": 0.10217285, "routerloss_mlp": 0.0, "step": 3401, "time_per_iteration": 2.590047836303711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049365, "balance_loss_mlp": 1.03930926, "diversity_loss_mlp": 0.0, "epoch": 0.654482493266641, "flos": 625495302144.0, "grad_norm": 0.06845614791056948, "language_loss": 0.86992002, "learning_rate": 0.00028173581577370545, "loss": 0.88041365, "num_input_tokens_seen": 282645472, "router_z_loss_mlp": 0.1005249, "routerloss_mlp": 0.0, "step": 3402, "time_per_iteration": 2.7577242851257324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047368, "balance_loss_mlp": 1.03716338, "diversity_loss_mlp": 0.0, "epoch": 0.6546748749519046, "flos": 523981550592.0, "grad_norm": 0.059228402052172, "language_loss": 0.78973734, "learning_rate": 0.0002814555665061844, "loss": 0.80021101, "num_input_tokens_seen": 282717568, "router_z_loss_mlp": 0.10198975, "routerloss_mlp": 0.0, "step": 3403, "time_per_iteration": 2.731137752532959 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047431, "balance_loss_mlp": 1.0375247, "diversity_loss_mlp": 0.0, "epoch": 0.6548672566371682, "flos": 479210204160.0, "grad_norm": 0.07926071177251158, "language_loss": 0.77611935, "learning_rate": 0.00028117540208513715, "loss": 0.78659368, "num_input_tokens_seen": 282791408, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3404, "time_per_iteration": 2.689107894897461 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0077145, "balance_loss_mlp": 1.2970531, "diversity_loss_mlp": 0.22200939, "epoch": 0.6550596383224317, "flos": 616012356096.0, "grad_norm": 0.029568297533915613, "language_loss": 0.85005927, "learning_rate": 0.00028089532261933313, "loss": 0.85777372, "num_input_tokens_seen": 282862992, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01191924, "step": 3405, "time_per_iteration": 2.7177927494049072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105146, "balance_loss_mlp": 1.04141116, "diversity_loss_mlp": 0.0, "epoch": 0.6552520200076952, "flos": 488836684800.0, "grad_norm": 0.08876519929545809, "language_loss": 0.85989165, "learning_rate": 0.0002806153282175087, "loss": 0.87040627, "num_input_tokens_seen": 282930448, "router_z_loss_mlp": 0.10046387, "routerloss_mlp": 0.0, "step": 3406, "time_per_iteration": 2.5502045154571533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053299, "balance_loss_mlp": 1.04348814, "diversity_loss_mlp": 0.0, "epoch": 0.6554444016929588, "flos": 687619196928.0, "grad_norm": 0.07350490516448754, "language_loss": 0.82776654, "learning_rate": 0.0002803354189883679, "loss": 0.83829957, "num_input_tokens_seen": 283010864, "router_z_loss_mlp": 0.09802246, "routerloss_mlp": 0.0, "step": 3407, "time_per_iteration": 2.8476340770721436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054863, "balance_loss_mlp": 1.0448494, "diversity_loss_mlp": 0.0, "epoch": 0.6556367833782224, "flos": 543051330048.0, "grad_norm": 0.06617021222220203, "language_loss": 0.85199594, "learning_rate": 0.00028005559504058053, "loss": 0.86254454, "num_input_tokens_seen": 283082240, "router_z_loss_mlp": 0.10009766, "routerloss_mlp": 0.0, "step": 3408, "time_per_iteration": 2.701035261154175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105219, "balance_loss_mlp": 1.04206932, "diversity_loss_mlp": 0.0, "epoch": 0.655829165063486, "flos": 673535554560.0, "grad_norm": 0.08388731304351217, "language_loss": 0.77208018, "learning_rate": 0.0002797758564827838, "loss": 0.78260207, "num_input_tokens_seen": 283156656, "router_z_loss_mlp": 0.10113525, "routerloss_mlp": 0.0, "step": 3409, "time_per_iteration": 2.8340024948120117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058496, "balance_loss_mlp": 1.04903674, "diversity_loss_mlp": 0.0, "epoch": 0.6560215467487496, "flos": 531806736384.0, "grad_norm": 0.07006819638769121, "language_loss": 0.83542061, "learning_rate": 0.0002794962034235824, "loss": 0.84600556, "num_input_tokens_seen": 283223584, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3410, "time_per_iteration": 2.634612798690796 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054024, "balance_loss_mlp": 1.04401076, "diversity_loss_mlp": 0.0, "epoch": 0.656213928434013, "flos": 591311467008.0, "grad_norm": 0.07454971523093613, "language_loss": 0.74929279, "learning_rate": 0.00027921663597154695, "loss": 0.75983304, "num_input_tokens_seen": 283297680, "router_z_loss_mlp": 0.10009766, "routerloss_mlp": 0.0, "step": 3411, "time_per_iteration": 2.736161708831787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058952, "balance_loss_mlp": 1.04926038, "diversity_loss_mlp": 0.0, "epoch": 0.6564063101192766, "flos": 415786825728.0, "grad_norm": 0.08159088858174726, "language_loss": 0.81125355, "learning_rate": 0.00027893715423521525, "loss": 0.82184303, "num_input_tokens_seen": 283359744, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 3412, "time_per_iteration": 2.452563524246216 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00781164, "balance_loss_mlp": 1.31892097, "diversity_loss_mlp": 0.22038518, "epoch": 0.6565986918045402, "flos": 453321059328.0, "grad_norm": 0.03347946196666781, "language_loss": 0.8419345, "learning_rate": 0.00027865775832309163, "loss": 0.84974611, "num_input_tokens_seen": 283430688, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01151081, "step": 3413, "time_per_iteration": 2.6782755851745605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068715, "balance_loss_mlp": 1.05899358, "diversity_loss_mlp": 0.0, "epoch": 0.6567910734898038, "flos": 547746001920.0, "grad_norm": 0.0675198993979362, "language_loss": 0.86263126, "learning_rate": 0.00027837844834364733, "loss": 0.87331843, "num_input_tokens_seen": 283498048, "router_z_loss_mlp": 0.09710693, "routerloss_mlp": 0.0, "step": 3414, "time_per_iteration": 2.63967227935791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058125, "balance_loss_mlp": 1.04836726, "diversity_loss_mlp": 0.0, "epoch": 0.6569834551750673, "flos": 655518210048.0, "grad_norm": 0.06663266607359189, "language_loss": 0.8659035, "learning_rate": 0.00027809922440532, "loss": 0.87648469, "num_input_tokens_seen": 283573040, "router_z_loss_mlp": 0.09753418, "routerloss_mlp": 0.0, "step": 3415, "time_per_iteration": 2.816204786300659 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059729, "balance_loss_mlp": 1.05018628, "diversity_loss_mlp": 0.0, "epoch": 0.6571758368603309, "flos": 539681107968.0, "grad_norm": 0.06360594790571725, "language_loss": 0.81154943, "learning_rate": 0.00027782008661651406, "loss": 0.82214665, "num_input_tokens_seen": 283651696, "router_z_loss_mlp": 0.09533691, "routerloss_mlp": 0.0, "step": 3416, "time_per_iteration": 2.80657958984375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059234, "balance_loss_mlp": 1.04937577, "diversity_loss_mlp": 0.0, "epoch": 0.6573682185455945, "flos": 497346117120.0, "grad_norm": 0.062003807204006764, "language_loss": 0.87255514, "learning_rate": 0.00027754103508560013, "loss": 0.88314748, "num_input_tokens_seen": 283721824, "router_z_loss_mlp": 0.09857178, "routerloss_mlp": 0.0, "step": 3417, "time_per_iteration": 2.648777723312378 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062014, "balance_loss_mlp": 1.05205965, "diversity_loss_mlp": 0.0, "epoch": 0.657560600230858, "flos": 447465295872.0, "grad_norm": 0.06781110485333444, "language_loss": 0.82382166, "learning_rate": 0.0002772620699209163, "loss": 0.83444178, "num_input_tokens_seen": 283786960, "router_z_loss_mlp": 0.0994873, "routerloss_mlp": 0.0, "step": 3418, "time_per_iteration": 2.566547155380249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010597, "balance_loss_mlp": 1.0503962, "diversity_loss_mlp": 0.0, "epoch": 0.6577529819161216, "flos": 481940596224.0, "grad_norm": 0.0650517875970755, "language_loss": 0.79616904, "learning_rate": 0.0002769831912307658, "loss": 0.80676609, "num_input_tokens_seen": 283853808, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 3419, "time_per_iteration": 2.606062889099121 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061383, "balance_loss_mlp": 1.05156565, "diversity_loss_mlp": 0.0, "epoch": 0.6579453636013851, "flos": 530843134464.0, "grad_norm": 0.07306581186555239, "language_loss": 0.80279779, "learning_rate": 0.00027670439912341917, "loss": 0.81341165, "num_input_tokens_seen": 283920960, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3420, "time_per_iteration": 2.616004228591919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058734, "balance_loss_mlp": 1.04903078, "diversity_loss_mlp": 0.0, "epoch": 0.6581377452866487, "flos": 628037743104.0, "grad_norm": 0.07531365664549339, "language_loss": 0.83319843, "learning_rate": 0.0002764256937071129, "loss": 0.84378576, "num_input_tokens_seen": 283992416, "router_z_loss_mlp": 0.09692383, "routerloss_mlp": 0.0, "step": 3421, "time_per_iteration": 2.7864840030670166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061647, "balance_loss_mlp": 1.05205703, "diversity_loss_mlp": 0.0, "epoch": 0.6583301269719123, "flos": 548618199552.0, "grad_norm": 0.06844647739450752, "language_loss": 0.87222612, "learning_rate": 0.00027614707509005036, "loss": 0.88284254, "num_input_tokens_seen": 284061760, "router_z_loss_mlp": 0.0958252, "routerloss_mlp": 0.0, "step": 3422, "time_per_iteration": 2.666473388671875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058786, "balance_loss_mlp": 1.04912376, "diversity_loss_mlp": 0.0, "epoch": 0.6585225086571759, "flos": 427493583360.0, "grad_norm": 0.0762783210263198, "language_loss": 0.79373097, "learning_rate": 0.0002758685433804008, "loss": 0.8043189, "num_input_tokens_seen": 284124848, "router_z_loss_mlp": 0.09649658, "routerloss_mlp": 0.0, "step": 3423, "time_per_iteration": 2.4872303009033203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056028, "balance_loss_mlp": 1.04637778, "diversity_loss_mlp": 0.0, "epoch": 0.6587148903424394, "flos": 859620542976.0, "grad_norm": 0.07259832833327884, "language_loss": 0.79187661, "learning_rate": 0.00027559009868630005, "loss": 0.80243689, "num_input_tokens_seen": 284206272, "router_z_loss_mlp": 0.09637451, "routerloss_mlp": 0.0, "step": 3424, "time_per_iteration": 3.1284892559051514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063823, "balance_loss_mlp": 1.0545187, "diversity_loss_mlp": 0.0, "epoch": 0.6589072720277029, "flos": 805630551552.0, "grad_norm": 0.07475259244153008, "language_loss": 0.80332637, "learning_rate": 0.0002753117411158491, "loss": 0.81396455, "num_input_tokens_seen": 284293696, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 3425, "time_per_iteration": 3.024216651916504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066892, "balance_loss_mlp": 1.05724216, "diversity_loss_mlp": 0.0, "epoch": 0.6590996537129665, "flos": 548618199552.0, "grad_norm": 0.06493586108743211, "language_loss": 0.89989424, "learning_rate": 0.0002750334707771168, "loss": 0.91056317, "num_input_tokens_seen": 284360192, "router_z_loss_mlp": 0.09637451, "routerloss_mlp": 0.0, "step": 3426, "time_per_iteration": 2.6436870098114014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066532, "balance_loss_mlp": 1.0567987, "diversity_loss_mlp": 0.0, "epoch": 0.6592920353982301, "flos": 454166092800.0, "grad_norm": 0.06891806065084582, "language_loss": 0.81568319, "learning_rate": 0.0002747552877781369, "loss": 0.82634848, "num_input_tokens_seen": 284423680, "router_z_loss_mlp": 0.097229, "routerloss_mlp": 0.0, "step": 3427, "time_per_iteration": 2.484457015991211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106612, "balance_loss_mlp": 1.05665517, "diversity_loss_mlp": 0.0, "epoch": 0.6594844170834937, "flos": 567174057984.0, "grad_norm": 0.06651025164376474, "language_loss": 0.81769067, "learning_rate": 0.0002744771922269097, "loss": 0.82835186, "num_input_tokens_seen": 284495712, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3428, "time_per_iteration": 2.724034547805786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073203, "balance_loss_mlp": 1.06395817, "diversity_loss_mlp": 0.0, "epoch": 0.6596767987687572, "flos": 1187911194624.0, "grad_norm": 0.08249136451092651, "language_loss": 0.81983304, "learning_rate": 0.0002741991842314015, "loss": 0.83056509, "num_input_tokens_seen": 284583440, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3429, "time_per_iteration": 3.4791431427001953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106958, "balance_loss_mlp": 1.06021035, "diversity_loss_mlp": 0.0, "epoch": 0.6598691804540208, "flos": 503491147776.0, "grad_norm": 0.09631718735244636, "language_loss": 0.85994452, "learning_rate": 0.0002739212638995445, "loss": 0.87064034, "num_input_tokens_seen": 284649168, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3430, "time_per_iteration": 2.5809226036071777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070566, "balance_loss_mlp": 1.06089258, "diversity_loss_mlp": 0.0, "epoch": 0.6600615621392844, "flos": 531337231872.0, "grad_norm": 0.07152811859744175, "language_loss": 0.83226836, "learning_rate": 0.00027364343133923696, "loss": 0.84297395, "num_input_tokens_seen": 284723136, "router_z_loss_mlp": 0.09667969, "routerloss_mlp": 0.0, "step": 3431, "time_per_iteration": 2.664724826812744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072179, "balance_loss_mlp": 1.06281483, "diversity_loss_mlp": 0.0, "epoch": 0.6602539438245479, "flos": 565446915072.0, "grad_norm": 0.07076815482363777, "language_loss": 0.82710063, "learning_rate": 0.0002733656866583431, "loss": 0.83782238, "num_input_tokens_seen": 284792752, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3432, "time_per_iteration": 2.6845815181732178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075681, "balance_loss_mlp": 1.06614459, "diversity_loss_mlp": 0.0, "epoch": 0.6604463255098114, "flos": 857159594496.0, "grad_norm": 0.07348653509543634, "language_loss": 0.83014315, "learning_rate": 0.0002730880299646927, "loss": 0.84089994, "num_input_tokens_seen": 284871008, "router_z_loss_mlp": 0.09527588, "routerloss_mlp": 0.0, "step": 3433, "time_per_iteration": 3.09417462348938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072804, "balance_loss_mlp": 1.06348789, "diversity_loss_mlp": 0.0, "epoch": 0.660638707195075, "flos": 674462080512.0, "grad_norm": 0.060523936244010056, "language_loss": 0.85307741, "learning_rate": 0.0002728104613660821, "loss": 0.86380541, "num_input_tokens_seen": 284945184, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3434, "time_per_iteration": 2.844012498855591 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071618, "balance_loss_mlp": 1.06231332, "diversity_loss_mlp": 0.0, "epoch": 0.6608310888803386, "flos": 888961402368.0, "grad_norm": 0.06580511923703304, "language_loss": 0.83062303, "learning_rate": 0.0002725329809702729, "loss": 0.84133923, "num_input_tokens_seen": 285029296, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 3435, "time_per_iteration": 3.203927516937256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070425, "balance_loss_mlp": 1.06119871, "diversity_loss_mlp": 0.0, "epoch": 0.6610234705656022, "flos": 1136347646976.0, "grad_norm": 0.07937285786961487, "language_loss": 0.76092625, "learning_rate": 0.0002722555888849921, "loss": 0.77163053, "num_input_tokens_seen": 285124720, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3436, "time_per_iteration": 3.441042423248291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071904, "balance_loss_mlp": 1.06265306, "diversity_loss_mlp": 0.0, "epoch": 0.6612158522508658, "flos": 468012598272.0, "grad_norm": 0.06477982340890849, "language_loss": 0.80420995, "learning_rate": 0.00027197828521793334, "loss": 0.81492901, "num_input_tokens_seen": 285191360, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3437, "time_per_iteration": 2.508976697921753 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072662, "balance_loss_mlp": 1.0631609, "diversity_loss_mlp": 0.0, "epoch": 0.6614082339361292, "flos": 571653614592.0, "grad_norm": 0.05773126923802199, "language_loss": 0.85235512, "learning_rate": 0.0002717010700767552, "loss": 0.86308175, "num_input_tokens_seen": 285262624, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3438, "time_per_iteration": 2.7343809604644775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788388, "balance_loss_mlp": 1.33122396, "diversity_loss_mlp": 0.22170436, "epoch": 0.6616006156213928, "flos": 498467934720.0, "grad_norm": 0.035967269047030424, "language_loss": 0.76073134, "learning_rate": 0.00027142394356908226, "loss": 0.76861525, "num_input_tokens_seen": 285328512, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01192367, "step": 3439, "time_per_iteration": 2.6098694801330566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072149, "balance_loss_mlp": 1.06304741, "diversity_loss_mlp": 0.0, "epoch": 0.6617929973066564, "flos": 602420239872.0, "grad_norm": 0.07092995700037574, "language_loss": 0.84935868, "learning_rate": 0.00027114690580250456, "loss": 0.86008012, "num_input_tokens_seen": 285406128, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 3440, "time_per_iteration": 2.7477781772613525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067851, "balance_loss_mlp": 1.05864227, "diversity_loss_mlp": 0.0, "epoch": 0.66198537899192, "flos": 522983443968.0, "grad_norm": 0.07606845250334485, "language_loss": 0.87084186, "learning_rate": 0.0002708699568845776, "loss": 0.88152039, "num_input_tokens_seen": 285474704, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 3441, "time_per_iteration": 2.6247143745422363 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068327, "balance_loss_mlp": 1.062343, "diversity_loss_mlp": 0.0, "epoch": 0.6621777606771835, "flos": 1566256642560.0, "grad_norm": 0.03817420207517821, "language_loss": 0.79287779, "learning_rate": 0.00027059309692282265, "loss": 0.80356109, "num_input_tokens_seen": 285698704, "router_z_loss_mlp": 0.05981445, "routerloss_mlp": 0.0, "step": 3442, "time_per_iteration": 4.9118194580078125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070581, "balance_loss_mlp": 1.06144977, "diversity_loss_mlp": 0.0, "epoch": 0.6623701423624471, "flos": 526664954880.0, "grad_norm": 0.059711141008881904, "language_loss": 0.83110899, "learning_rate": 0.0002703163260247261, "loss": 0.84181482, "num_input_tokens_seen": 285767936, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 3443, "time_per_iteration": 2.6146388053894043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070764, "balance_loss_mlp": 1.06162047, "diversity_loss_mlp": 0.0, "epoch": 0.6625625240477107, "flos": 528179553792.0, "grad_norm": 0.07293118954211444, "language_loss": 0.81726909, "learning_rate": 0.0002700396442977399, "loss": 0.82797676, "num_input_tokens_seen": 285839456, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 3444, "time_per_iteration": 2.6122488975524902 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072126, "balance_loss_mlp": 1.06287587, "diversity_loss_mlp": 0.0, "epoch": 0.6627549057329742, "flos": 473122073088.0, "grad_norm": 0.06235524151571192, "language_loss": 0.84365332, "learning_rate": 0.0002697630518492817, "loss": 0.85437459, "num_input_tokens_seen": 285905904, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3445, "time_per_iteration": 2.695577621459961 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074381, "balance_loss_mlp": 1.06496358, "diversity_loss_mlp": 0.0, "epoch": 0.6629472874182378, "flos": 527996745216.0, "grad_norm": 0.09449311389962292, "language_loss": 0.85555631, "learning_rate": 0.0002694865487867343, "loss": 0.86630011, "num_input_tokens_seen": 285975520, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 3446, "time_per_iteration": 2.643448829650879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066192, "balance_loss_mlp": 1.0568881, "diversity_loss_mlp": 0.0, "epoch": 0.6631396691035013, "flos": 613200471552.0, "grad_norm": 0.06130478535455018, "language_loss": 0.84665477, "learning_rate": 0.0002692101352174453, "loss": 0.85731673, "num_input_tokens_seen": 286050320, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 3447, "time_per_iteration": 2.7684693336486816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071985, "balance_loss_mlp": 1.06239462, "diversity_loss_mlp": 0.0, "epoch": 0.6633320507887649, "flos": 609318899712.0, "grad_norm": 0.0686574359328325, "language_loss": 0.84783942, "learning_rate": 0.00026893381124872787, "loss": 0.85855925, "num_input_tokens_seen": 286120672, "router_z_loss_mlp": 0.09576416, "routerloss_mlp": 0.0, "step": 3448, "time_per_iteration": 2.6856155395507812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077767, "balance_loss_mlp": 1.06869519, "diversity_loss_mlp": 0.0, "epoch": 0.6635244324740285, "flos": 749700873216.0, "grad_norm": 0.07711664740076789, "language_loss": 0.80761468, "learning_rate": 0.00026865757698786097, "loss": 0.8183924, "num_input_tokens_seen": 286201152, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 3449, "time_per_iteration": 3.0219905376434326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064699, "balance_loss_mlp": 1.05549026, "diversity_loss_mlp": 0.0, "epoch": 0.6637168141592921, "flos": 664526882304.0, "grad_norm": 0.07081100750222453, "language_loss": 0.81853712, "learning_rate": 0.000268381432542088, "loss": 0.82918411, "num_input_tokens_seen": 286274512, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 3450, "time_per_iteration": 2.7959303855895996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063443, "balance_loss_mlp": 1.05394757, "diversity_loss_mlp": 0.0, "epoch": 0.6639091958445555, "flos": 606783799296.0, "grad_norm": 0.0764006206271421, "language_loss": 0.80043346, "learning_rate": 0.00026810537801861807, "loss": 0.81106788, "num_input_tokens_seen": 286349808, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3451, "time_per_iteration": 2.7303504943847656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058217, "balance_loss_mlp": 1.04875171, "diversity_loss_mlp": 0.0, "epoch": 0.6641015775298191, "flos": 476697498624.0, "grad_norm": 0.05834244489040309, "language_loss": 0.81090832, "learning_rate": 0.0002678294135246243, "loss": 0.82149041, "num_input_tokens_seen": 286422912, "router_z_loss_mlp": 0.09460449, "routerloss_mlp": 0.0, "step": 3452, "time_per_iteration": 2.733463764190674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056165, "balance_loss_mlp": 1.04691422, "diversity_loss_mlp": 0.0, "epoch": 0.6642939592150827, "flos": 904115105280.0, "grad_norm": 0.07343702884431198, "language_loss": 0.86356318, "learning_rate": 0.0002675535391672463, "loss": 0.87412483, "num_input_tokens_seen": 286501072, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3453, "time_per_iteration": 3.115978956222534 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00797636, "balance_loss_mlp": 1.35083306, "diversity_loss_mlp": 0.22054271, "epoch": 0.6644863409003463, "flos": 581808697344.0, "grad_norm": 0.028810841374919304, "language_loss": 0.86237454, "learning_rate": 0.0002672777550535877, "loss": 0.87035096, "num_input_tokens_seen": 286580480, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01194801, "step": 3454, "time_per_iteration": 2.793548822402954 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060819, "balance_loss_mlp": 1.05172312, "diversity_loss_mlp": 0.0, "epoch": 0.6646787225856099, "flos": 479002802688.0, "grad_norm": 0.0753840272591569, "language_loss": 0.85331321, "learning_rate": 0.00026700206129071747, "loss": 0.8639214, "num_input_tokens_seen": 286646208, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 3455, "time_per_iteration": 2.5915210247039795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064823, "balance_loss_mlp": 1.05565548, "diversity_loss_mlp": 0.0, "epoch": 0.6648711042708734, "flos": 449906420736.0, "grad_norm": 0.07433202645873906, "language_loss": 0.89061069, "learning_rate": 0.00026672645798566925, "loss": 0.90125895, "num_input_tokens_seen": 286710624, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 3456, "time_per_iteration": 2.5754494667053223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059485, "balance_loss_mlp": 1.05019283, "diversity_loss_mlp": 0.0, "epoch": 0.665063485956137, "flos": 858960516096.0, "grad_norm": 0.07294926148794169, "language_loss": 0.79539233, "learning_rate": 0.00026645094524544225, "loss": 0.80598718, "num_input_tokens_seen": 286799472, "router_z_loss_mlp": 0.09289551, "routerloss_mlp": 0.0, "step": 3457, "time_per_iteration": 3.2948148250579834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056831, "balance_loss_mlp": 1.04734802, "diversity_loss_mlp": 0.0, "epoch": 0.6652558676414005, "flos": 604312939008.0, "grad_norm": 0.08386362480566827, "language_loss": 0.75221157, "learning_rate": 0.00026617552317699945, "loss": 0.76277989, "num_input_tokens_seen": 286874752, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3458, "time_per_iteration": 2.789961576461792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057714, "balance_loss_mlp": 1.04836822, "diversity_loss_mlp": 0.0, "epoch": 0.6654482493266641, "flos": 510394576896.0, "grad_norm": 0.09354786354914506, "language_loss": 0.87007248, "learning_rate": 0.0002659001918872693, "loss": 0.88064957, "num_input_tokens_seen": 286943312, "router_z_loss_mlp": 0.09350586, "routerloss_mlp": 0.0, "step": 3459, "time_per_iteration": 2.6320250034332275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058346, "balance_loss_mlp": 1.04896998, "diversity_loss_mlp": 0.0, "epoch": 0.6656406310119277, "flos": 565605130752.0, "grad_norm": 0.06598239053228593, "language_loss": 0.80718446, "learning_rate": 0.0002656249514831449, "loss": 0.81776798, "num_input_tokens_seen": 287010000, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 3460, "time_per_iteration": 2.6485753059387207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063647, "balance_loss_mlp": 1.05442595, "diversity_loss_mlp": 0.0, "epoch": 0.6658330126971912, "flos": 1024298141184.0, "grad_norm": 0.05863451757746151, "language_loss": 0.87114978, "learning_rate": 0.00026534980207148416, "loss": 0.88178623, "num_input_tokens_seen": 287101456, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 3461, "time_per_iteration": 3.4618935585021973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066457, "balance_loss_mlp": 1.05719471, "diversity_loss_mlp": 0.0, "epoch": 0.6660253943824548, "flos": 816823388160.0, "grad_norm": 0.07572861338992695, "language_loss": 0.73451698, "learning_rate": 0.0002650747437591097, "loss": 0.7451815, "num_input_tokens_seen": 287182848, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 3462, "time_per_iteration": 2.985516309738159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01026805, "balance_loss_mlp": 1.02065372, "diversity_loss_mlp": 0.0, "epoch": 0.6662177760677184, "flos": 1496169169920.0, "grad_norm": 0.017950660829121307, "language_loss": 0.8187958, "learning_rate": 0.00026479977665280806, "loss": 0.82906377, "num_input_tokens_seen": 287417920, "router_z_loss_mlp": 0.06152344, "routerloss_mlp": 0.0, "step": 3463, "time_per_iteration": 5.041592359542847 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067724, "balance_loss_mlp": 1.05844963, "diversity_loss_mlp": 0.0, "epoch": 0.666410157752982, "flos": 500120925696.0, "grad_norm": 0.06793562911737132, "language_loss": 0.86417711, "learning_rate": 0.00026452490085933155, "loss": 0.87485433, "num_input_tokens_seen": 287483776, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3464, "time_per_iteration": 2.5661425590515137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069546, "balance_loss_mlp": 1.05994368, "diversity_loss_mlp": 0.0, "epoch": 0.6666025394382454, "flos": 481169714688.0, "grad_norm": 0.08819800975527838, "language_loss": 0.89818048, "learning_rate": 0.00026425011648539614, "loss": 0.90887594, "num_input_tokens_seen": 287548176, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3465, "time_per_iteration": 2.5488314628601074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065239, "balance_loss_mlp": 1.05584478, "diversity_loss_mlp": 0.0, "epoch": 0.666794921123509, "flos": 546653919744.0, "grad_norm": 0.06406494944770698, "language_loss": 0.82567346, "learning_rate": 0.00026397542363768267, "loss": 0.83632582, "num_input_tokens_seen": 287618496, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3466, "time_per_iteration": 2.669250965118408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00781407, "balance_loss_mlp": 1.32080197, "diversity_loss_mlp": 0.21862534, "epoch": 0.6669873028087726, "flos": 471988145664.0, "grad_norm": 0.03313864292511896, "language_loss": 0.8202821, "learning_rate": 0.0002637008224228362, "loss": 0.82809615, "num_input_tokens_seen": 287684032, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01169338, "step": 3467, "time_per_iteration": 2.572173833847046 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070583, "balance_loss_mlp": 1.06133246, "diversity_loss_mlp": 0.0, "epoch": 0.6671796844940362, "flos": 547395065856.0, "grad_norm": 0.05107139851875669, "language_loss": 0.8441903, "learning_rate": 0.00026342631294746653, "loss": 0.85489613, "num_input_tokens_seen": 287757680, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3468, "time_per_iteration": 2.698885917663574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072156, "balance_loss_mlp": 1.06254137, "diversity_loss_mlp": 0.0, "epoch": 0.6673720661792998, "flos": 1070317214208.0, "grad_norm": 0.05734496396036439, "language_loss": 0.80842233, "learning_rate": 0.0002631518953181476, "loss": 0.81914389, "num_input_tokens_seen": 287848992, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3469, "time_per_iteration": 3.4733734130859375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101407, "balance_loss_mlp": 1.0077759, "diversity_loss_mlp": 0.0, "epoch": 0.6675644478645633, "flos": 1523790600192.0, "grad_norm": 0.015747171991140264, "language_loss": 0.76325285, "learning_rate": 0.000262877569641418, "loss": 0.77339357, "num_input_tokens_seen": 288085680, "router_z_loss_mlp": 0.06298828, "routerloss_mlp": 0.0, "step": 3470, "time_per_iteration": 4.929265737533569 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074034, "balance_loss_mlp": 1.06445539, "diversity_loss_mlp": 0.0, "epoch": 0.6677568295498268, "flos": 579696113664.0, "grad_norm": 0.060826323549746535, "language_loss": 0.80429429, "learning_rate": 0.00026260333602377985, "loss": 0.81503463, "num_input_tokens_seen": 288161568, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3471, "time_per_iteration": 2.848822593688965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076958, "balance_loss_mlp": 1.06758189, "diversity_loss_mlp": 0.0, "epoch": 0.6679492112350904, "flos": 383935458816.0, "grad_norm": 0.07184696149338711, "language_loss": 0.87395489, "learning_rate": 0.0002623291945717007, "loss": 0.88472444, "num_input_tokens_seen": 288224032, "router_z_loss_mlp": 0.09368896, "routerloss_mlp": 0.0, "step": 3472, "time_per_iteration": 2.500190019607544 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073902, "balance_loss_mlp": 1.06426954, "diversity_loss_mlp": 0.0, "epoch": 0.668141592920354, "flos": 1150759830528.0, "grad_norm": 0.06589735356893138, "language_loss": 0.84111875, "learning_rate": 0.00026205514539161175, "loss": 0.85185778, "num_input_tokens_seen": 288312912, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3473, "time_per_iteration": 3.534797191619873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072796, "balance_loss_mlp": 1.0632112, "diversity_loss_mlp": 0.0, "epoch": 0.6683339746056175, "flos": 561100608000.0, "grad_norm": 0.059882211902428664, "language_loss": 0.83973366, "learning_rate": 0.00026178118858990773, "loss": 0.8504616, "num_input_tokens_seen": 288394224, "router_z_loss_mlp": 0.09576416, "routerloss_mlp": 0.0, "step": 3474, "time_per_iteration": 2.8565967082977295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070699, "balance_loss_mlp": 1.06109083, "diversity_loss_mlp": 0.0, "epoch": 0.6685263562908811, "flos": 514305884160.0, "grad_norm": 0.06021787961002869, "language_loss": 0.84205377, "learning_rate": 0.0002615073242729483, "loss": 0.85276067, "num_input_tokens_seen": 288462976, "router_z_loss_mlp": 0.0960083, "routerloss_mlp": 0.0, "step": 3475, "time_per_iteration": 2.678913116455078 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070157, "balance_loss_mlp": 1.0605185, "diversity_loss_mlp": 0.0, "epoch": 0.6687187379761447, "flos": 629772226560.0, "grad_norm": 0.05349171948445146, "language_loss": 0.84449661, "learning_rate": 0.0002612335525470573, "loss": 0.85519814, "num_input_tokens_seen": 288542032, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3476, "time_per_iteration": 2.8754477500915527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063847, "balance_loss_mlp": 1.05415487, "diversity_loss_mlp": 0.0, "epoch": 0.6689111196614083, "flos": 535586992128.0, "grad_norm": 0.0743507074362168, "language_loss": 0.78049976, "learning_rate": 0.0002609598735185221, "loss": 0.79113823, "num_input_tokens_seen": 288610704, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 3477, "time_per_iteration": 2.6721932888031006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066424, "balance_loss_mlp": 1.05687511, "diversity_loss_mlp": 0.0, "epoch": 0.6691035013466718, "flos": 603038048256.0, "grad_norm": 0.06005632064488323, "language_loss": 0.83158946, "learning_rate": 0.00026068628729359445, "loss": 0.84225374, "num_input_tokens_seen": 288686080, "router_z_loss_mlp": 0.09545898, "routerloss_mlp": 0.0, "step": 3478, "time_per_iteration": 2.7650654315948486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068151, "balance_loss_mlp": 1.05830431, "diversity_loss_mlp": 0.0, "epoch": 0.6692958830319353, "flos": 632855752704.0, "grad_norm": 0.0704650229723735, "language_loss": 0.76221395, "learning_rate": 0.00026041279397848996, "loss": 0.77289546, "num_input_tokens_seen": 288764944, "router_z_loss_mlp": 0.09844971, "routerloss_mlp": 0.0, "step": 3479, "time_per_iteration": 2.8531105518341064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065245, "balance_loss_mlp": 1.055673, "diversity_loss_mlp": 0.0, "epoch": 0.6694882647171989, "flos": 645471783936.0, "grad_norm": 0.06824163679163787, "language_loss": 0.82570118, "learning_rate": 0.00026013939367938797, "loss": 0.8363536, "num_input_tokens_seen": 288847856, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3480, "time_per_iteration": 2.8762619495391846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00798551, "balance_loss_mlp": 1.35232079, "diversity_loss_mlp": 0.22152299, "epoch": 0.6696806464024625, "flos": 569585447424.0, "grad_norm": 0.028482542431452974, "language_loss": 0.81186199, "learning_rate": 0.00025986608650243204, "loss": 0.81984746, "num_input_tokens_seen": 288929360, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01162949, "step": 3481, "time_per_iteration": 2.8153860569000244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071437, "balance_loss_mlp": 1.06166184, "diversity_loss_mlp": 0.0, "epoch": 0.6698730280877261, "flos": 622700669952.0, "grad_norm": 0.08903053329626802, "language_loss": 0.79281807, "learning_rate": 0.0002595928725537293, "loss": 0.80353248, "num_input_tokens_seen": 289010160, "router_z_loss_mlp": 0.09771729, "routerloss_mlp": 0.0, "step": 3482, "time_per_iteration": 2.8563952445983887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064965, "balance_loss_mlp": 1.05542827, "diversity_loss_mlp": 0.0, "epoch": 0.6700654097729896, "flos": 502507722240.0, "grad_norm": 0.06597366352184171, "language_loss": 0.8811605, "learning_rate": 0.0002593197519393509, "loss": 0.89181018, "num_input_tokens_seen": 289077392, "router_z_loss_mlp": 0.09539795, "routerloss_mlp": 0.0, "step": 3483, "time_per_iteration": 2.659468650817871 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060552, "balance_loss_mlp": 1.05117035, "diversity_loss_mlp": 0.0, "epoch": 0.6702577914582531, "flos": 623876815872.0, "grad_norm": 0.06129183928704833, "language_loss": 0.79517573, "learning_rate": 0.00025904672476533165, "loss": 0.80578125, "num_input_tokens_seen": 289157248, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 3484, "time_per_iteration": 2.843041181564331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062734, "balance_loss_mlp": 1.0531497, "diversity_loss_mlp": 0.0, "epoch": 0.6704501731435167, "flos": 456268764672.0, "grad_norm": 0.06231151375576235, "language_loss": 0.82821012, "learning_rate": 0.0002587737911376704, "loss": 0.83883744, "num_input_tokens_seen": 289224864, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3485, "time_per_iteration": 2.579852819442749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065444, "balance_loss_mlp": 1.0560143, "diversity_loss_mlp": 0.0, "epoch": 0.6706425548287803, "flos": 543229369344.0, "grad_norm": 0.06196157664485949, "language_loss": 0.84223086, "learning_rate": 0.00025850095116232885, "loss": 0.85288531, "num_input_tokens_seen": 289293488, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3486, "time_per_iteration": 2.6867549419403076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059913, "balance_loss_mlp": 1.05029857, "diversity_loss_mlp": 0.0, "epoch": 0.6708349365140439, "flos": 633940494336.0, "grad_norm": 0.07455755751361211, "language_loss": 0.77796304, "learning_rate": 0.000258228204945233, "loss": 0.78856218, "num_input_tokens_seen": 289370560, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3487, "time_per_iteration": 2.9104583263397217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788296, "balance_loss_mlp": 1.33072948, "diversity_loss_mlp": 0.22110668, "epoch": 0.6710273181993074, "flos": 640747749888.0, "grad_norm": 0.03107378418050736, "language_loss": 0.84813625, "learning_rate": 0.00025795555259227254, "loss": 0.8560192, "num_input_tokens_seen": 289440096, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0123779, "step": 3488, "time_per_iteration": 2.799049139022827 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064064, "balance_loss_mlp": 1.05453348, "diversity_loss_mlp": 0.0, "epoch": 0.671219699884571, "flos": 553942789632.0, "grad_norm": 0.05587900492957358, "language_loss": 0.8365714, "learning_rate": 0.00025768299420930046, "loss": 0.84721196, "num_input_tokens_seen": 289515808, "router_z_loss_mlp": 0.09515381, "routerloss_mlp": 0.0, "step": 3489, "time_per_iteration": 2.7350802421569824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059988, "balance_loss_mlp": 1.05058801, "diversity_loss_mlp": 0.0, "epoch": 0.6714120815698346, "flos": 731508433920.0, "grad_norm": 0.0636982622522837, "language_loss": 0.83686626, "learning_rate": 0.0002574105299021332, "loss": 0.84746611, "num_input_tokens_seen": 289591344, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 3490, "time_per_iteration": 2.8952267169952393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056705, "balance_loss_mlp": 1.04722226, "diversity_loss_mlp": 0.0, "epoch": 0.6716044632550981, "flos": 688664291328.0, "grad_norm": 0.059047086854658884, "language_loss": 0.84235394, "learning_rate": 0.00025713815977655084, "loss": 0.85292095, "num_input_tokens_seen": 289672032, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3491, "time_per_iteration": 2.8801188468933105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059823, "balance_loss_mlp": 1.05020285, "diversity_loss_mlp": 0.0, "epoch": 0.6717968449403616, "flos": 460629752832.0, "grad_norm": 0.0713613195550899, "language_loss": 0.84868813, "learning_rate": 0.0002568658839382969, "loss": 0.85928631, "num_input_tokens_seen": 289738304, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3492, "time_per_iteration": 2.565765380859375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055936, "balance_loss_mlp": 1.04666197, "diversity_loss_mlp": 0.0, "epoch": 0.6719892266256252, "flos": 501608360448.0, "grad_norm": 0.0809894292628365, "language_loss": 0.8436929, "learning_rate": 0.00025659370249307814, "loss": 0.85425228, "num_input_tokens_seen": 289804304, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3493, "time_per_iteration": 2.61505126953125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056736, "balance_loss_mlp": 1.04709792, "diversity_loss_mlp": 0.0, "epoch": 0.6721816083108888, "flos": 683525081088.0, "grad_norm": 0.06605957100839344, "language_loss": 0.85386133, "learning_rate": 0.00025632161554656473, "loss": 0.86442864, "num_input_tokens_seen": 289877696, "router_z_loss_mlp": 0.09631348, "routerloss_mlp": 0.0, "step": 3494, "time_per_iteration": 2.8639488220214844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054512, "balance_loss_mlp": 1.04485643, "diversity_loss_mlp": 0.0, "epoch": 0.6723739899961524, "flos": 585813980160.0, "grad_norm": 0.0758709557174038, "language_loss": 0.8232398, "learning_rate": 0.00025604962320439017, "loss": 0.83378488, "num_input_tokens_seen": 289947296, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3495, "time_per_iteration": 2.71235728263855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056091, "balance_loss_mlp": 1.04692411, "diversity_loss_mlp": 0.0, "epoch": 0.672566371681416, "flos": 506616519168.0, "grad_norm": 0.06832671008161519, "language_loss": 0.82082075, "learning_rate": 0.0002557777255721516, "loss": 0.83138162, "num_input_tokens_seen": 290020080, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 3496, "time_per_iteration": 2.728652000427246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052712, "balance_loss_mlp": 1.04334807, "diversity_loss_mlp": 0.0, "epoch": 0.6727587533666795, "flos": 535671055872.0, "grad_norm": 0.07590882568517338, "language_loss": 0.80502313, "learning_rate": 0.0002555059227554087, "loss": 0.81555027, "num_input_tokens_seen": 290094544, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3497, "time_per_iteration": 2.6704843044281006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054312, "balance_loss_mlp": 1.04488301, "diversity_loss_mlp": 0.0, "epoch": 0.672951135051943, "flos": 602832844800.0, "grad_norm": 0.0738650094824256, "language_loss": 0.77972269, "learning_rate": 0.00025523421485968453, "loss": 0.79026586, "num_input_tokens_seen": 290173520, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3498, "time_per_iteration": 2.8093771934509277 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057909, "balance_loss_mlp": 1.04843736, "diversity_loss_mlp": 0.0, "epoch": 0.6731435167372066, "flos": 811315989504.0, "grad_norm": 0.07086262263525961, "language_loss": 0.85447127, "learning_rate": 0.00025496260199046585, "loss": 0.86505038, "num_input_tokens_seen": 290248240, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3499, "time_per_iteration": 3.0010836124420166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105668, "balance_loss_mlp": 1.04721487, "diversity_loss_mlp": 0.0, "epoch": 0.6733358984224702, "flos": 611594468352.0, "grad_norm": 0.056698795982303, "language_loss": 0.84606051, "learning_rate": 0.000254691084253202, "loss": 0.85662723, "num_input_tokens_seen": 290326288, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3500, "time_per_iteration": 2.7931160926818848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106411, "balance_loss_mlp": 1.05446577, "diversity_loss_mlp": 0.0, "epoch": 0.6735282801077337, "flos": 558901762560.0, "grad_norm": 0.075539637024569, "language_loss": 0.77243733, "learning_rate": 0.00025441966175330567, "loss": 0.78307843, "num_input_tokens_seen": 290395984, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3501, "time_per_iteration": 2.6508493423461914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067964, "balance_loss_mlp": 1.05850506, "diversity_loss_mlp": 0.0, "epoch": 0.6737206617929973, "flos": 672433560576.0, "grad_norm": 0.07065885937587965, "language_loss": 0.79737401, "learning_rate": 0.00025414833459615183, "loss": 0.80805361, "num_input_tokens_seen": 290470224, "router_z_loss_mlp": 0.09460449, "routerloss_mlp": 0.0, "step": 3502, "time_per_iteration": 2.784524917602539 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074197, "balance_loss_mlp": 1.0648514, "diversity_loss_mlp": 0.0, "epoch": 0.6739130434782609, "flos": 633446396928.0, "grad_norm": 0.06652503704287359, "language_loss": 0.80206275, "learning_rate": 0.0002538771028870796, "loss": 0.8128047, "num_input_tokens_seen": 290542864, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 3503, "time_per_iteration": 2.802136182785034 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075432, "balance_loss_mlp": 1.06571674, "diversity_loss_mlp": 0.0, "epoch": 0.6741054251635245, "flos": 531445888512.0, "grad_norm": 0.06376799007020843, "language_loss": 0.81455564, "learning_rate": 0.0002536059667313903, "loss": 0.82530999, "num_input_tokens_seen": 290617248, "router_z_loss_mlp": 0.09710693, "routerloss_mlp": 0.0, "step": 3504, "time_per_iteration": 2.711933135986328 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068251, "balance_loss_mlp": 1.05844057, "diversity_loss_mlp": 0.0, "epoch": 0.674297806848788, "flos": 542604220416.0, "grad_norm": 0.09964706429340704, "language_loss": 0.89608288, "learning_rate": 0.0002533349262343483, "loss": 0.9067654, "num_input_tokens_seen": 290690112, "router_z_loss_mlp": 0.09802246, "routerloss_mlp": 0.0, "step": 3505, "time_per_iteration": 2.6715004444122314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082337, "balance_loss_mlp": 1.07268143, "diversity_loss_mlp": 0.0, "epoch": 0.6744901885340515, "flos": 463523129856.0, "grad_norm": 0.06572677444304757, "language_loss": 0.81604284, "learning_rate": 0.0002530639815011807, "loss": 0.82686627, "num_input_tokens_seen": 290756352, "router_z_loss_mlp": 0.09649658, "routerloss_mlp": 0.0, "step": 3506, "time_per_iteration": 2.4929287433624268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078955, "balance_loss_mlp": 1.33325195, "diversity_loss_mlp": 0.2229899, "epoch": 0.6746825702193151, "flos": 631830481920.0, "grad_norm": 0.03439328096706921, "language_loss": 0.8506915, "learning_rate": 0.0002527931326370781, "loss": 0.85858697, "num_input_tokens_seen": 290829776, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01142928, "step": 3507, "time_per_iteration": 2.83644962310791 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084193, "balance_loss_mlp": 1.07446539, "diversity_loss_mlp": 0.0, "epoch": 0.6748749519045787, "flos": 671146186752.0, "grad_norm": 0.08750505461607005, "language_loss": 0.82915336, "learning_rate": 0.00025252237974719276, "loss": 0.83999527, "num_input_tokens_seen": 290900736, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3508, "time_per_iteration": 2.871253252029419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081215, "balance_loss_mlp": 1.07155883, "diversity_loss_mlp": 0.0, "epoch": 0.6750673335898423, "flos": 767102980608.0, "grad_norm": 0.08335060522291943, "language_loss": 0.80458963, "learning_rate": 0.00025225172293664056, "loss": 0.81540173, "num_input_tokens_seen": 290981696, "router_z_loss_mlp": 0.09643555, "routerloss_mlp": 0.0, "step": 3509, "time_per_iteration": 3.033853530883789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01014527, "balance_loss_mlp": 1.00832772, "diversity_loss_mlp": 0.0, "epoch": 0.6752597152751059, "flos": 1512607675392.0, "grad_norm": 0.01800991302482, "language_loss": 0.76933134, "learning_rate": 0.00025198116231049954, "loss": 0.77947664, "num_input_tokens_seen": 291217888, "router_z_loss_mlp": 0.06176758, "routerloss_mlp": 0.0, "step": 3510, "time_per_iteration": 4.911616325378418 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085264, "balance_loss_mlp": 1.07521439, "diversity_loss_mlp": 0.0, "epoch": 0.6754520969603693, "flos": 687297996288.0, "grad_norm": 0.09401749664970258, "language_loss": 0.84862983, "learning_rate": 0.00025171069797381106, "loss": 0.85948253, "num_input_tokens_seen": 291287856, "router_z_loss_mlp": 0.10046387, "routerloss_mlp": 0.0, "step": 3511, "time_per_iteration": 2.8283350467681885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071317, "balance_loss_mlp": 1.06139874, "diversity_loss_mlp": 0.0, "epoch": 0.6756444786456329, "flos": 500577947136.0, "grad_norm": 0.06520954806538445, "language_loss": 0.82273233, "learning_rate": 0.00025144033003157864, "loss": 0.83344549, "num_input_tokens_seen": 291354912, "router_z_loss_mlp": 0.09912109, "routerloss_mlp": 0.0, "step": 3512, "time_per_iteration": 2.5983166694641113 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070965, "balance_loss_mlp": 1.06117821, "diversity_loss_mlp": 0.0, "epoch": 0.6758368603308965, "flos": 492616940544.0, "grad_norm": 0.08310754245868612, "language_loss": 0.78935671, "learning_rate": 0.00025117005858876806, "loss": 0.80006635, "num_input_tokens_seen": 291426816, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3513, "time_per_iteration": 2.6797635555267334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787238, "balance_loss_mlp": 1.33182001, "diversity_loss_mlp": 0.21994653, "epoch": 0.6760292420161601, "flos": 555934233600.0, "grad_norm": 0.03353723121835004, "language_loss": 0.85560071, "learning_rate": 0.000250899883750308, "loss": 0.86347306, "num_input_tokens_seen": 291497648, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0113544, "step": 3514, "time_per_iteration": 2.7176060676574707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059441, "balance_loss_mlp": 1.04921913, "diversity_loss_mlp": 0.0, "epoch": 0.6762216237014236, "flos": 607601668608.0, "grad_norm": 0.07453608092591449, "language_loss": 0.81898236, "learning_rate": 0.00025062980562109006, "loss": 0.82957679, "num_input_tokens_seen": 291568080, "router_z_loss_mlp": 0.10223389, "routerloss_mlp": 0.0, "step": 3515, "time_per_iteration": 2.7594966888427734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00789958, "balance_loss_mlp": 1.33716106, "diversity_loss_mlp": 0.21975538, "epoch": 0.6764140053866872, "flos": 533785697280.0, "grad_norm": 0.033729691487123833, "language_loss": 0.83036506, "learning_rate": 0.0002503598243059677, "loss": 0.83826458, "num_input_tokens_seen": 291644896, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01150025, "step": 3516, "time_per_iteration": 2.891763687133789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058281, "balance_loss_mlp": 1.04839277, "diversity_loss_mlp": 0.0, "epoch": 0.6766063870719508, "flos": 504810455040.0, "grad_norm": 0.07017833187059877, "language_loss": 0.80408925, "learning_rate": 0.0002500899399097568, "loss": 0.81467211, "num_input_tokens_seen": 291716864, "router_z_loss_mlp": 0.09887695, "routerloss_mlp": 0.0, "step": 3517, "time_per_iteration": 2.672029972076416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00786476, "balance_loss_mlp": 1.32907259, "diversity_loss_mlp": 0.22110882, "epoch": 0.6767987687572143, "flos": 513176726016.0, "grad_norm": 0.038425556988831724, "language_loss": 0.85818875, "learning_rate": 0.0002498201525372359, "loss": 0.86605346, "num_input_tokens_seen": 291786000, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01138566, "step": 3518, "time_per_iteration": 2.617760419845581 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054389, "balance_loss_mlp": 1.04459572, "diversity_loss_mlp": 0.0, "epoch": 0.6769911504424779, "flos": 525039128064.0, "grad_norm": 0.06814874892769256, "language_loss": 0.83201683, "learning_rate": 0.00024955046229314584, "loss": 0.84256077, "num_input_tokens_seen": 291854768, "router_z_loss_mlp": 0.09783936, "routerloss_mlp": 0.0, "step": 3519, "time_per_iteration": 2.6269547939300537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051557, "balance_loss_mlp": 1.04138207, "diversity_loss_mlp": 0.0, "epoch": 0.6771835321277414, "flos": 449896508928.0, "grad_norm": 0.06326657634867637, "language_loss": 0.87517166, "learning_rate": 0.00024928086928218947, "loss": 0.88568723, "num_input_tokens_seen": 291918096, "router_z_loss_mlp": 0.10174561, "routerloss_mlp": 0.0, "step": 3520, "time_per_iteration": 2.500542163848877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057369, "balance_loss_mlp": 1.04749823, "diversity_loss_mlp": 0.0, "epoch": 0.677375913813005, "flos": 709349985792.0, "grad_norm": 0.0729210521666428, "language_loss": 0.76251125, "learning_rate": 0.00024901137360903216, "loss": 0.77308488, "num_input_tokens_seen": 291998752, "router_z_loss_mlp": 0.09869385, "routerloss_mlp": 0.0, "step": 3521, "time_per_iteration": 2.921558380126953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055481, "balance_loss_mlp": 1.04586673, "diversity_loss_mlp": 0.0, "epoch": 0.6775682954982686, "flos": 428420109312.0, "grad_norm": 0.08065371435227142, "language_loss": 0.80853164, "learning_rate": 0.00024874197537830115, "loss": 0.81908649, "num_input_tokens_seen": 292065056, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3522, "time_per_iteration": 2.5280978679656982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00793286, "balance_loss_mlp": 1.3416667, "diversity_loss_mlp": 0.22178407, "epoch": 0.6777606771835322, "flos": 437905626624.0, "grad_norm": 0.034341347950706966, "language_loss": 0.834656, "learning_rate": 0.00024847267469458684, "loss": 0.8425889, "num_input_tokens_seen": 292129248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0115611, "step": 3523, "time_per_iteration": 2.5251760482788086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058563, "balance_loss_mlp": 1.04881763, "diversity_loss_mlp": 0.0, "epoch": 0.6779530588687956, "flos": 775442087424.0, "grad_norm": 0.0593554156839795, "language_loss": 0.77790511, "learning_rate": 0.00024820347166244034, "loss": 0.78849077, "num_input_tokens_seen": 292206080, "router_z_loss_mlp": 0.09741211, "routerloss_mlp": 0.0, "step": 3524, "time_per_iteration": 2.9970362186431885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061163, "balance_loss_mlp": 1.051489, "diversity_loss_mlp": 0.0, "epoch": 0.6781454405540592, "flos": 571782094848.0, "grad_norm": 0.05785383684082485, "language_loss": 0.8476572, "learning_rate": 0.0002479343663863755, "loss": 0.85826874, "num_input_tokens_seen": 292280192, "router_z_loss_mlp": 0.09674072, "routerloss_mlp": 0.0, "step": 3525, "time_per_iteration": 2.748159885406494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059476, "balance_loss_mlp": 1.04968917, "diversity_loss_mlp": 0.0, "epoch": 0.6783378222393228, "flos": 485026693632.0, "grad_norm": 0.0719627260838572, "language_loss": 0.76970756, "learning_rate": 0.00024766535897086876, "loss": 0.78030241, "num_input_tokens_seen": 292347792, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3526, "time_per_iteration": 2.5848824977874756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060587, "balance_loss_mlp": 1.05073428, "diversity_loss_mlp": 0.0, "epoch": 0.6785302039245864, "flos": 482839958016.0, "grad_norm": 0.06835251841322831, "language_loss": 0.79290187, "learning_rate": 0.0002473964495203578, "loss": 0.80350775, "num_input_tokens_seen": 292420032, "router_z_loss_mlp": 0.09851074, "routerloss_mlp": 0.0, "step": 3527, "time_per_iteration": 2.6953914165496826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106164, "balance_loss_mlp": 1.05191827, "diversity_loss_mlp": 0.0, "epoch": 0.67872258560985, "flos": 524732608512.0, "grad_norm": 0.06684083470405644, "language_loss": 0.85681713, "learning_rate": 0.0002471276381392425, "loss": 0.86743355, "num_input_tokens_seen": 292497792, "router_z_loss_mlp": 0.09710693, "routerloss_mlp": 0.0, "step": 3528, "time_per_iteration": 2.7917094230651855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01030948, "balance_loss_mlp": 1.02451074, "diversity_loss_mlp": 0.0, "epoch": 0.6789149672951135, "flos": 1552605428736.0, "grad_norm": 0.029269024795112553, "language_loss": 0.78188634, "learning_rate": 0.0002468589249318848, "loss": 0.7921958, "num_input_tokens_seen": 292726704, "router_z_loss_mlp": 0.06445312, "routerloss_mlp": 0.0, "step": 3529, "time_per_iteration": 4.962055921554565 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066825, "balance_loss_mlp": 1.05733609, "diversity_loss_mlp": 0.0, "epoch": 0.6791073489803771, "flos": 741406556160.0, "grad_norm": 0.06831388456608918, "language_loss": 0.84243917, "learning_rate": 0.00024659031000260826, "loss": 0.85310745, "num_input_tokens_seen": 292802320, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3530, "time_per_iteration": 2.8746378421783447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066552, "balance_loss_mlp": 1.05688381, "diversity_loss_mlp": 0.0, "epoch": 0.6792997306656406, "flos": 576365538816.0, "grad_norm": 0.07285232550578888, "language_loss": 0.80730051, "learning_rate": 0.0002463217934556985, "loss": 0.81796598, "num_input_tokens_seen": 292870480, "router_z_loss_mlp": 0.09661865, "routerloss_mlp": 0.0, "step": 3531, "time_per_iteration": 2.7028424739837646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01014286, "balance_loss_mlp": 1.00808728, "diversity_loss_mlp": 0.0, "epoch": 0.6794921123509042, "flos": 1503337273344.0, "grad_norm": 0.01858574921496822, "language_loss": 0.7653209, "learning_rate": 0.000246053375395403, "loss": 0.77546376, "num_input_tokens_seen": 293100752, "router_z_loss_mlp": 0.06201172, "routerloss_mlp": 0.0, "step": 3532, "time_per_iteration": 4.780252933502197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071665, "balance_loss_mlp": 1.06221724, "diversity_loss_mlp": 0.0, "epoch": 0.6796844940361677, "flos": 698923261440.0, "grad_norm": 0.08979673870599186, "language_loss": 0.83808529, "learning_rate": 0.0002457850559259306, "loss": 0.84880191, "num_input_tokens_seen": 293178192, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3533, "time_per_iteration": 2.9009928703308105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107616, "balance_loss_mlp": 1.06684947, "diversity_loss_mlp": 0.0, "epoch": 0.6798768757214313, "flos": 552759303168.0, "grad_norm": 0.06667977411786664, "language_loss": 0.81866515, "learning_rate": 0.00024551683515145275, "loss": 0.82942677, "num_input_tokens_seen": 293246368, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 3534, "time_per_iteration": 2.67411208152771 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076962, "balance_loss_mlp": 1.0675205, "diversity_loss_mlp": 0.0, "epoch": 0.6800692574066949, "flos": 522936456192.0, "grad_norm": 0.06662082176408471, "language_loss": 0.86499625, "learning_rate": 0.0002452487131761014, "loss": 0.87576586, "num_input_tokens_seen": 293320656, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3535, "time_per_iteration": 2.723414421081543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071026, "balance_loss_mlp": 1.06126261, "diversity_loss_mlp": 0.0, "epoch": 0.6802616390919585, "flos": 574023158784.0, "grad_norm": 0.07513209939898634, "language_loss": 0.79904449, "learning_rate": 0.00024498069010397093, "loss": 0.80975473, "num_input_tokens_seen": 293388592, "router_z_loss_mlp": 0.09753418, "routerloss_mlp": 0.0, "step": 3536, "time_per_iteration": 2.729044198989868 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071543, "balance_loss_mlp": 1.06177378, "diversity_loss_mlp": 0.0, "epoch": 0.6804540207772221, "flos": 488157207552.0, "grad_norm": 0.062001089349607685, "language_loss": 0.85142958, "learning_rate": 0.00024471276603911697, "loss": 0.86214507, "num_input_tokens_seen": 293453936, "router_z_loss_mlp": 0.09759521, "routerloss_mlp": 0.0, "step": 3537, "time_per_iteration": 4.243680953979492 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073179, "balance_loss_mlp": 1.06360102, "diversity_loss_mlp": 0.0, "epoch": 0.6806464024624855, "flos": 578594119680.0, "grad_norm": 0.06230124795461592, "language_loss": 0.79373354, "learning_rate": 0.0002444449410855572, "loss": 0.80446529, "num_input_tokens_seen": 293527664, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3538, "time_per_iteration": 2.744311571121216 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071763, "balance_loss_mlp": 1.06218505, "diversity_loss_mlp": 0.0, "epoch": 0.6808387841477491, "flos": 553722905088.0, "grad_norm": 0.057428584707934646, "language_loss": 0.84307408, "learning_rate": 0.00024417721534727033, "loss": 0.85379171, "num_input_tokens_seen": 293599344, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3539, "time_per_iteration": 2.643796920776367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073723, "balance_loss_mlp": 1.06420994, "diversity_loss_mlp": 0.0, "epoch": 0.6810311658330127, "flos": 426841270272.0, "grad_norm": 0.09448746877359589, "language_loss": 0.82968056, "learning_rate": 0.00024390958892819687, "loss": 0.8404178, "num_input_tokens_seen": 293663088, "router_z_loss_mlp": 0.09509277, "routerloss_mlp": 0.0, "step": 3540, "time_per_iteration": 2.500807285308838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010722, "balance_loss_mlp": 1.0624193, "diversity_loss_mlp": 0.0, "epoch": 0.6812235475182763, "flos": 572256368640.0, "grad_norm": 0.06494427347835982, "language_loss": 0.80941665, "learning_rate": 0.0002436420619322381, "loss": 0.82013869, "num_input_tokens_seen": 293741296, "router_z_loss_mlp": 0.09771729, "routerloss_mlp": 0.0, "step": 3541, "time_per_iteration": 2.8345742225646973 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077361, "balance_loss_mlp": 1.0675267, "diversity_loss_mlp": 0.0, "epoch": 0.6814159292035398, "flos": 501917078016.0, "grad_norm": 0.07816741001086884, "language_loss": 0.82754946, "learning_rate": 0.0002433746344632577, "loss": 0.83832312, "num_input_tokens_seen": 293815840, "router_z_loss_mlp": 0.0982666, "routerloss_mlp": 0.0, "step": 3542, "time_per_iteration": 2.6863982677459717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067339, "balance_loss_mlp": 1.05741465, "diversity_loss_mlp": 0.0, "epoch": 0.6816083108888034, "flos": 765531482112.0, "grad_norm": 0.06517118266272649, "language_loss": 0.80166835, "learning_rate": 0.00024310730662508006, "loss": 0.81234175, "num_input_tokens_seen": 293896368, "router_z_loss_mlp": 0.09924316, "routerloss_mlp": 0.0, "step": 3543, "time_per_iteration": 3.0644540786743164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070309, "balance_loss_mlp": 1.06105816, "diversity_loss_mlp": 0.0, "epoch": 0.681800692574067, "flos": 479459824128.0, "grad_norm": 0.06994305910782121, "language_loss": 0.87753445, "learning_rate": 0.0002428400785214911, "loss": 0.88823748, "num_input_tokens_seen": 293963344, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3544, "time_per_iteration": 2.5769219398498535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070709, "balance_loss_mlp": 1.06136894, "diversity_loss_mlp": 0.0, "epoch": 0.6819930742593305, "flos": 691604656128.0, "grad_norm": 0.07082765333867001, "language_loss": 0.82354796, "learning_rate": 0.00024257295025623794, "loss": 0.83425504, "num_input_tokens_seen": 294035440, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 3545, "time_per_iteration": 2.799276828765869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066867, "balance_loss_mlp": 1.05750871, "diversity_loss_mlp": 0.0, "epoch": 0.6821854559445941, "flos": 678096603648.0, "grad_norm": 0.06649234916050309, "language_loss": 0.8049404, "learning_rate": 0.00024230592193302892, "loss": 0.8156091, "num_input_tokens_seen": 294116944, "router_z_loss_mlp": 0.09350586, "routerloss_mlp": 0.0, "step": 3546, "time_per_iteration": 2.9205825328826904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064713, "balance_loss_mlp": 1.05521762, "diversity_loss_mlp": 0.0, "epoch": 0.6823778376298576, "flos": 462191339520.0, "grad_norm": 0.07288649013986744, "language_loss": 0.84268177, "learning_rate": 0.00024203899365553372, "loss": 0.85332888, "num_input_tokens_seen": 294178976, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3547, "time_per_iteration": 2.5345499515533447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01028061, "balance_loss_mlp": 1.02241051, "diversity_loss_mlp": 0.0, "epoch": 0.6825702193151212, "flos": 1475298842112.0, "grad_norm": 0.024887330229706912, "language_loss": 0.76734358, "learning_rate": 0.00024177216552738302, "loss": 0.77762419, "num_input_tokens_seen": 294384960, "router_z_loss_mlp": 0.05639648, "routerloss_mlp": 0.0, "step": 3548, "time_per_iteration": 4.575555801391602 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066126, "balance_loss_mlp": 1.05700111, "diversity_loss_mlp": 0.0, "epoch": 0.6827626010003848, "flos": 723114998784.0, "grad_norm": 0.06418703018565212, "language_loss": 0.83182037, "learning_rate": 0.00024150543765216848, "loss": 0.84248167, "num_input_tokens_seen": 294461408, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 3549, "time_per_iteration": 2.9021003246307373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060662, "balance_loss_mlp": 1.05113733, "diversity_loss_mlp": 0.0, "epoch": 0.6829549826856484, "flos": 558864686592.0, "grad_norm": 0.07049185581954354, "language_loss": 0.83715057, "learning_rate": 0.00024123881013344352, "loss": 0.8477571, "num_input_tokens_seen": 294530624, "router_z_loss_mlp": 0.09515381, "routerloss_mlp": 0.0, "step": 3550, "time_per_iteration": 2.671104669570923 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062514, "balance_loss_mlp": 1.05335271, "diversity_loss_mlp": 0.0, "epoch": 0.6831473643709118, "flos": 624934393344.0, "grad_norm": 0.06503037380674516, "language_loss": 0.7999897, "learning_rate": 0.00024097228307472202, "loss": 0.81061488, "num_input_tokens_seen": 294606784, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 3551, "time_per_iteration": 2.826650619506836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064244, "balance_loss_mlp": 1.05474889, "diversity_loss_mlp": 0.0, "epoch": 0.6833397460561754, "flos": 713861849088.0, "grad_norm": 0.06680109192015529, "language_loss": 0.82289582, "learning_rate": 0.00024070585657947846, "loss": 0.83353829, "num_input_tokens_seen": 294686960, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3552, "time_per_iteration": 2.831995725631714 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010645, "balance_loss_mlp": 1.05527949, "diversity_loss_mlp": 0.0, "epoch": 0.683532127741439, "flos": 464704045056.0, "grad_norm": 0.065434895685697, "language_loss": 0.85023475, "learning_rate": 0.00024043953075114934, "loss": 0.86087978, "num_input_tokens_seen": 294759712, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 3553, "time_per_iteration": 2.622846841812134 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055947, "balance_loss_mlp": 1.0463928, "diversity_loss_mlp": 0.0, "epoch": 0.6837245094267026, "flos": 582251037696.0, "grad_norm": 0.07243414619593286, "language_loss": 0.89257199, "learning_rate": 0.00024017330569313128, "loss": 0.90313148, "num_input_tokens_seen": 294830592, "router_z_loss_mlp": 0.09545898, "routerloss_mlp": 0.0, "step": 3554, "time_per_iteration": 2.705098867416382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065851, "balance_loss_mlp": 1.05631375, "diversity_loss_mlp": 0.0, "epoch": 0.6839168911119662, "flos": 794173413888.0, "grad_norm": 0.06810293796091849, "language_loss": 0.7482394, "learning_rate": 0.0002399071815087821, "loss": 0.7588979, "num_input_tokens_seen": 294907504, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3555, "time_per_iteration": 3.053788900375366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064522, "balance_loss_mlp": 1.05496788, "diversity_loss_mlp": 0.0, "epoch": 0.6841092727972297, "flos": 580009973760.0, "grad_norm": 0.0721005752972134, "language_loss": 0.83788198, "learning_rate": 0.00023964115830142025, "loss": 0.84852719, "num_input_tokens_seen": 294977600, "router_z_loss_mlp": 0.09545898, "routerloss_mlp": 0.0, "step": 3556, "time_per_iteration": 2.7068707942962646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062319, "balance_loss_mlp": 1.05320573, "diversity_loss_mlp": 0.0, "epoch": 0.6843016544824932, "flos": 383742738432.0, "grad_norm": 0.07897700130685587, "language_loss": 0.87426114, "learning_rate": 0.00023937523617432522, "loss": 0.88488424, "num_input_tokens_seen": 295039408, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3557, "time_per_iteration": 2.526129722595215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063804, "balance_loss_mlp": 1.05461264, "diversity_loss_mlp": 0.0, "epoch": 0.6844940361677568, "flos": 1439035476480.0, "grad_norm": 0.08002974259616906, "language_loss": 0.8704505, "learning_rate": 0.00023910941523073705, "loss": 0.88108861, "num_input_tokens_seen": 295142928, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3558, "time_per_iteration": 3.884982109069824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067156, "balance_loss_mlp": 1.05752969, "diversity_loss_mlp": 0.0, "epoch": 0.6846864178530204, "flos": 520870860288.0, "grad_norm": 0.0697798269972245, "language_loss": 0.86687434, "learning_rate": 0.0002388436955738566, "loss": 0.87754589, "num_input_tokens_seen": 295215504, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3559, "time_per_iteration": 2.6896438598632812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067142, "balance_loss_mlp": 1.05763495, "diversity_loss_mlp": 0.0, "epoch": 0.6848787995382839, "flos": 717946053120.0, "grad_norm": 0.07371598831130721, "language_loss": 0.81583881, "learning_rate": 0.00023857807730684523, "loss": 0.82651019, "num_input_tokens_seen": 295291024, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 3560, "time_per_iteration": 2.906409740447998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070721, "balance_loss_mlp": 1.06119633, "diversity_loss_mlp": 0.0, "epoch": 0.6850711812235475, "flos": 511061571072.0, "grad_norm": 0.09020757950976771, "language_loss": 0.82591355, "learning_rate": 0.00023831256053282547, "loss": 0.83662075, "num_input_tokens_seen": 295363248, "router_z_loss_mlp": 0.09527588, "routerloss_mlp": 0.0, "step": 3561, "time_per_iteration": 2.741647481918335 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076955, "balance_loss_mlp": 1.06726301, "diversity_loss_mlp": 0.0, "epoch": 0.6852635629088111, "flos": 668151493632.0, "grad_norm": 0.06598100836979733, "language_loss": 0.7798056, "learning_rate": 0.00023804714535488003, "loss": 0.79057515, "num_input_tokens_seen": 295442032, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3562, "time_per_iteration": 2.8663859367370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01022665, "balance_loss_mlp": 1.01694274, "diversity_loss_mlp": 0.0, "epoch": 0.6854559445940747, "flos": 1522980071424.0, "grad_norm": 0.018293527884891043, "language_loss": 0.7980963, "learning_rate": 0.0002377818318760519, "loss": 0.80832297, "num_input_tokens_seen": 295680560, "router_z_loss_mlp": 0.05712891, "routerloss_mlp": 0.0, "step": 3563, "time_per_iteration": 4.938952684402466 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076838, "balance_loss_mlp": 1.06765318, "diversity_loss_mlp": 0.0, "epoch": 0.6856483262793382, "flos": 454203168768.0, "grad_norm": 0.06579070354920068, "language_loss": 0.8089236, "learning_rate": 0.00023751662019934488, "loss": 0.81969196, "num_input_tokens_seen": 295745712, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3564, "time_per_iteration": 2.4886345863342285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01085968, "balance_loss_mlp": 1.07677126, "diversity_loss_mlp": 0.0, "epoch": 0.6858407079646017, "flos": 615552763392.0, "grad_norm": 0.06770513871895241, "language_loss": 0.79428673, "learning_rate": 0.00023725151042772364, "loss": 0.80514634, "num_input_tokens_seen": 295815104, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 3565, "time_per_iteration": 2.7136006355285645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01091397, "balance_loss_mlp": 1.08220637, "diversity_loss_mlp": 0.0, "epoch": 0.6860330896498653, "flos": 466053087744.0, "grad_norm": 0.0657025292696896, "language_loss": 0.83245081, "learning_rate": 0.00023698650266411276, "loss": 0.84336478, "num_input_tokens_seen": 295882928, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 3566, "time_per_iteration": 2.619652032852173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01087671, "balance_loss_mlp": 1.07844996, "diversity_loss_mlp": 0.0, "epoch": 0.6862254713351289, "flos": 864270425088.0, "grad_norm": 0.07570090303701395, "language_loss": 0.82732457, "learning_rate": 0.00023672159701139755, "loss": 0.83820128, "num_input_tokens_seen": 295970960, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 3567, "time_per_iteration": 3.2096190452575684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01092795, "balance_loss_mlp": 1.08350825, "diversity_loss_mlp": 0.0, "epoch": 0.6864178530203925, "flos": 447141523968.0, "grad_norm": 0.07219945861824417, "language_loss": 0.86111134, "learning_rate": 0.00023645679357242296, "loss": 0.87203926, "num_input_tokens_seen": 296036128, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 3568, "time_per_iteration": 2.598115921020508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00792363, "balance_loss_mlp": 1.34135008, "diversity_loss_mlp": 0.22022857, "epoch": 0.6866102347056561, "flos": 424269093888.0, "grad_norm": 0.03374979092207147, "language_loss": 0.84308195, "learning_rate": 0.00023619209244999534, "loss": 0.85100567, "num_input_tokens_seen": 296101440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01157361, "step": 3569, "time_per_iteration": 2.647141695022583 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0108474, "balance_loss_mlp": 1.07559109, "diversity_loss_mlp": 0.0, "epoch": 0.6868026163909196, "flos": 472373586432.0, "grad_norm": 0.09720254317506574, "language_loss": 0.85017771, "learning_rate": 0.0002359274937468806, "loss": 0.86102515, "num_input_tokens_seen": 296165504, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 3570, "time_per_iteration": 2.5088424682617188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080851, "balance_loss_mlp": 1.07149255, "diversity_loss_mlp": 0.0, "epoch": 0.6869949980761831, "flos": 464190124032.0, "grad_norm": 0.06491952507138833, "language_loss": 0.77798098, "learning_rate": 0.00023566299756580512, "loss": 0.78878951, "num_input_tokens_seen": 296236880, "router_z_loss_mlp": 0.09350586, "routerloss_mlp": 0.0, "step": 3571, "time_per_iteration": 2.6349782943725586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080655, "balance_loss_mlp": 1.07132113, "diversity_loss_mlp": 0.0, "epoch": 0.6871873797614467, "flos": 426235944960.0, "grad_norm": 0.07205344290521438, "language_loss": 0.78495932, "learning_rate": 0.0002353986040094551, "loss": 0.79576588, "num_input_tokens_seen": 296299776, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 3572, "time_per_iteration": 2.4710493087768555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079091, "balance_loss_mlp": 1.06974494, "diversity_loss_mlp": 0.0, "epoch": 0.6873797614467103, "flos": 443625569280.0, "grad_norm": 0.07195013135933294, "language_loss": 0.7977035, "learning_rate": 0.00023513431318047796, "loss": 0.80849445, "num_input_tokens_seen": 296365408, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 3573, "time_per_iteration": 2.5213143825531006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01081479, "balance_loss_mlp": 1.07233512, "diversity_loss_mlp": 0.0, "epoch": 0.6875721431319738, "flos": 992323436544.0, "grad_norm": 0.0671999790126143, "language_loss": 0.77178657, "learning_rate": 0.00023487012518147977, "loss": 0.78260136, "num_input_tokens_seen": 296445488, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3574, "time_per_iteration": 3.2319135665893555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073879, "balance_loss_mlp": 1.06456256, "diversity_loss_mlp": 0.0, "epoch": 0.6877645248172374, "flos": 1285513638912.0, "grad_norm": 0.06898424741609648, "language_loss": 0.84452772, "learning_rate": 0.00023460604011502772, "loss": 0.85526657, "num_input_tokens_seen": 296529936, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3575, "time_per_iteration": 3.8878557682037354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075527, "balance_loss_mlp": 1.0666877, "diversity_loss_mlp": 0.0, "epoch": 0.687956906502501, "flos": 876733383168.0, "grad_norm": 0.0699577179930161, "language_loss": 0.85862118, "learning_rate": 0.00023434205808364845, "loss": 0.86937642, "num_input_tokens_seen": 296607488, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 3576, "time_per_iteration": 3.1633143424987793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072972, "balance_loss_mlp": 1.06390619, "diversity_loss_mlp": 0.0, "epoch": 0.6881492881877646, "flos": 563324419584.0, "grad_norm": 0.07476899851847786, "language_loss": 0.85238355, "learning_rate": 0.00023407817918982932, "loss": 0.86311328, "num_input_tokens_seen": 296678672, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3577, "time_per_iteration": 2.7126357555389404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075894, "balance_loss_mlp": 1.06677413, "diversity_loss_mlp": 0.0, "epoch": 0.6883416698730281, "flos": 795127104000.0, "grad_norm": 0.07427735671199864, "language_loss": 0.78816962, "learning_rate": 0.00023381440353601718, "loss": 0.79892862, "num_input_tokens_seen": 296758896, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 3578, "time_per_iteration": 2.9925150871276855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069003, "balance_loss_mlp": 1.05976987, "diversity_loss_mlp": 0.0, "epoch": 0.6885340515582916, "flos": 723621579264.0, "grad_norm": 0.07604251893794473, "language_loss": 0.86125422, "learning_rate": 0.00023355073122461822, "loss": 0.87194419, "num_input_tokens_seen": 296830736, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3579, "time_per_iteration": 2.938112258911133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065285, "balance_loss_mlp": 1.05620754, "diversity_loss_mlp": 0.0, "epoch": 0.6887264332435552, "flos": 1010926282752.0, "grad_norm": 0.06357801718819331, "language_loss": 0.82597542, "learning_rate": 0.00023328716235799973, "loss": 0.83662832, "num_input_tokens_seen": 296911504, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3580, "time_per_iteration": 3.2711336612701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066362, "balance_loss_mlp": 1.05755877, "diversity_loss_mlp": 0.0, "epoch": 0.6889188149288188, "flos": 585262983168.0, "grad_norm": 0.07922172227575792, "language_loss": 0.84162283, "learning_rate": 0.00023302369703848803, "loss": 0.85228646, "num_input_tokens_seen": 296981488, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 3581, "time_per_iteration": 2.8185226917266846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069197, "balance_loss_mlp": 1.06004775, "diversity_loss_mlp": 0.0, "epoch": 0.6891111966140824, "flos": 636119889408.0, "grad_norm": 0.07416922878209098, "language_loss": 0.79931486, "learning_rate": 0.00023276033536836937, "loss": 0.81000686, "num_input_tokens_seen": 297054896, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 3582, "time_per_iteration": 2.844299554824829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061227, "balance_loss_mlp": 1.05179787, "diversity_loss_mlp": 0.0, "epoch": 0.6893035782993459, "flos": 495270609408.0, "grad_norm": 0.06489183727188522, "language_loss": 0.85119617, "learning_rate": 0.00023249707744988984, "loss": 0.86180842, "num_input_tokens_seen": 297128224, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3583, "time_per_iteration": 2.701711654663086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060915, "balance_loss_mlp": 1.05140829, "diversity_loss_mlp": 0.0, "epoch": 0.6894959599846094, "flos": 458215792128.0, "grad_norm": 0.07019303893436639, "language_loss": 0.82148254, "learning_rate": 0.00023223392338525529, "loss": 0.83209163, "num_input_tokens_seen": 297191312, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3584, "time_per_iteration": 2.5167200565338135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053502, "balance_loss_mlp": 1.04406083, "diversity_loss_mlp": 0.0, "epoch": 0.689688341669873, "flos": 505003175424.0, "grad_norm": 0.06639305906088179, "language_loss": 0.78639823, "learning_rate": 0.00023197087327663107, "loss": 0.79693329, "num_input_tokens_seen": 297261904, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3585, "time_per_iteration": 2.6349897384643555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057819, "balance_loss_mlp": 1.04834747, "diversity_loss_mlp": 0.0, "epoch": 0.6898807233551366, "flos": 763910797824.0, "grad_norm": 0.0732534701091779, "language_loss": 0.81201088, "learning_rate": 0.00023170792722614243, "loss": 0.82258916, "num_input_tokens_seen": 297338352, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3586, "time_per_iteration": 2.9198050498962402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056115, "balance_loss_mlp": 1.04651892, "diversity_loss_mlp": 0.0, "epoch": 0.6900731050404002, "flos": 583337977344.0, "grad_norm": 0.06720533838288198, "language_loss": 0.83776879, "learning_rate": 0.00023144508533587377, "loss": 0.84832996, "num_input_tokens_seen": 297416688, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3587, "time_per_iteration": 2.8723502159118652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054327, "balance_loss_mlp": 1.04436147, "diversity_loss_mlp": 0.0, "epoch": 0.6902654867256637, "flos": 711865262592.0, "grad_norm": 0.07065225941485688, "language_loss": 0.78699905, "learning_rate": 0.0002311823477078698, "loss": 0.79754233, "num_input_tokens_seen": 297499968, "router_z_loss_mlp": 0.09960938, "routerloss_mlp": 0.0, "step": 3588, "time_per_iteration": 2.9407894611358643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054694, "balance_loss_mlp": 1.04507959, "diversity_loss_mlp": 0.0, "epoch": 0.6904578684109273, "flos": 597112902144.0, "grad_norm": 0.0778571388662146, "language_loss": 0.85240763, "learning_rate": 0.00023091971444413428, "loss": 0.8629545, "num_input_tokens_seen": 297574480, "router_z_loss_mlp": 0.0960083, "routerloss_mlp": 0.0, "step": 3589, "time_per_iteration": 2.796943187713623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054005, "balance_loss_mlp": 1.04385448, "diversity_loss_mlp": 0.0, "epoch": 0.6906502500961909, "flos": 585040527360.0, "grad_norm": 0.0732795678952718, "language_loss": 0.82600373, "learning_rate": 0.00023065718564663012, "loss": 0.8365438, "num_input_tokens_seen": 297645360, "router_z_loss_mlp": 0.1015625, "routerloss_mlp": 0.0, "step": 3590, "time_per_iteration": 2.742586135864258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010537, "balance_loss_mlp": 1.00519681, "diversity_loss_mlp": 0.0, "epoch": 0.6908426317814544, "flos": 1587827017728.0, "grad_norm": 0.012465594930310886, "language_loss": 0.73911589, "learning_rate": 0.00023039476141728011, "loss": 0.74922127, "num_input_tokens_seen": 297879472, "router_z_loss_mlp": 0.0534668, "routerloss_mlp": 0.0, "step": 3591, "time_per_iteration": 4.981812477111816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0079259, "balance_loss_mlp": 1.34177041, "diversity_loss_mlp": 0.2198928, "epoch": 0.6910350134667179, "flos": 500780579328.0, "grad_norm": 0.028847197535296083, "language_loss": 0.80689478, "learning_rate": 0.0002301324418579666, "loss": 0.81482071, "num_input_tokens_seen": 297950672, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0117582, "step": 3592, "time_per_iteration": 2.71809983253479 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0058906, "balance_loss_mlp": 1.01557088, "diversity_loss_mlp": 0.14263315, "epoch": 0.6912273951519815, "flos": 1409194257408.0, "grad_norm": 0.0010924650790030575, "language_loss": 0.78688473, "learning_rate": 0.00022987022707053107, "loss": 0.79277533, "num_input_tokens_seen": 298171728, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00995804, "step": 3593, "time_per_iteration": 4.800194263458252 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064196, "balance_loss_mlp": 1.05474234, "diversity_loss_mlp": 0.0, "epoch": 0.6914197768372451, "flos": 635279625216.0, "grad_norm": 0.08227146788009188, "language_loss": 0.80700612, "learning_rate": 0.00022960811715677415, "loss": 0.81764805, "num_input_tokens_seen": 298250304, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3594, "time_per_iteration": 2.8780887126922607 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065404, "balance_loss_mlp": 1.05574787, "diversity_loss_mlp": 0.0, "epoch": 0.6916121585225087, "flos": 558044246016.0, "grad_norm": 0.06283622806249096, "language_loss": 0.82029772, "learning_rate": 0.00022934611221845608, "loss": 0.83095175, "num_input_tokens_seen": 298328000, "router_z_loss_mlp": 0.09661865, "routerloss_mlp": 0.0, "step": 3595, "time_per_iteration": 2.80785870552063 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062245, "balance_loss_mlp": 1.05264866, "diversity_loss_mlp": 0.0, "epoch": 0.6918045402077723, "flos": 529167748608.0, "grad_norm": 0.07415067488634865, "language_loss": 0.77666163, "learning_rate": 0.00022908421235729609, "loss": 0.78728402, "num_input_tokens_seen": 298406832, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3596, "time_per_iteration": 2.75410795211792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065698, "balance_loss_mlp": 1.05607235, "diversity_loss_mlp": 0.0, "epoch": 0.6919969218930357, "flos": 570351559680.0, "grad_norm": 0.06984612144500793, "language_loss": 0.8509379, "learning_rate": 0.0002288224176749728, "loss": 0.86159492, "num_input_tokens_seen": 298477584, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3597, "time_per_iteration": 2.670696258544922 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070664, "balance_loss_mlp": 1.06105542, "diversity_loss_mlp": 0.0, "epoch": 0.6921893035782993, "flos": 683305196544.0, "grad_norm": 0.1037313094960325, "language_loss": 0.78704476, "learning_rate": 0.00022856072827312385, "loss": 0.79775131, "num_input_tokens_seen": 298551872, "router_z_loss_mlp": 0.09606934, "routerloss_mlp": 0.0, "step": 3598, "time_per_iteration": 2.795475959777832 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106581, "balance_loss_mlp": 1.05624998, "diversity_loss_mlp": 0.0, "epoch": 0.6923816852635629, "flos": 546745324032.0, "grad_norm": 0.06439958207329444, "language_loss": 0.77316082, "learning_rate": 0.00022829914425334598, "loss": 0.78381896, "num_input_tokens_seen": 298619680, "router_z_loss_mlp": 0.09558105, "routerloss_mlp": 0.0, "step": 3599, "time_per_iteration": 2.6179866790771484 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064235, "balance_loss_mlp": 1.05483484, "diversity_loss_mlp": 0.0, "epoch": 0.6925740669488265, "flos": 510036300288.0, "grad_norm": 0.06408780313496462, "language_loss": 0.80725557, "learning_rate": 0.0002280376657171956, "loss": 0.81789792, "num_input_tokens_seen": 298690080, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3600, "time_per_iteration": 2.633162021636963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064214, "balance_loss_mlp": 1.05445051, "diversity_loss_mlp": 0.0, "epoch": 0.69276644863409, "flos": 869424689664.0, "grad_norm": 0.07377083778937557, "language_loss": 0.76414573, "learning_rate": 0.00022777629276618706, "loss": 0.77478784, "num_input_tokens_seen": 298777712, "router_z_loss_mlp": 0.09759521, "routerloss_mlp": 0.0, "step": 3601, "time_per_iteration": 3.0916104316711426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065762, "balance_loss_mlp": 1.05597496, "diversity_loss_mlp": 0.0, "epoch": 0.6929588303193536, "flos": 625772086272.0, "grad_norm": 0.06702562864271609, "language_loss": 0.77948666, "learning_rate": 0.0002275150255017947, "loss": 0.79014426, "num_input_tokens_seen": 298854368, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3602, "time_per_iteration": 2.7668936252593994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01012943, "balance_loss_mlp": 1.00765014, "diversity_loss_mlp": 0.0, "epoch": 0.6931512120046172, "flos": 1545382996992.0, "grad_norm": 0.010670435186768691, "language_loss": 0.75732672, "learning_rate": 0.0002272538640254511, "loss": 0.76745617, "num_input_tokens_seen": 299091664, "router_z_loss_mlp": 0.05297852, "routerloss_mlp": 0.0, "step": 3603, "time_per_iteration": 5.010159492492676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01011501, "balance_loss_mlp": 1.00618434, "diversity_loss_mlp": 0.0, "epoch": 0.6933435936898807, "flos": 1448230606848.0, "grad_norm": 0.00963913060826947, "language_loss": 0.75127101, "learning_rate": 0.0002269928084385487, "loss": 0.76138604, "num_input_tokens_seen": 299312656, "router_z_loss_mlp": 0.05322266, "routerloss_mlp": 0.0, "step": 3604, "time_per_iteration": 4.7926812171936035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061095, "balance_loss_mlp": 1.05157018, "diversity_loss_mlp": 0.0, "epoch": 0.6935359753751443, "flos": 540896901120.0, "grad_norm": 0.06111799581134822, "language_loss": 0.84283471, "learning_rate": 0.0002267318588424379, "loss": 0.85344565, "num_input_tokens_seen": 299381136, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3605, "time_per_iteration": 2.732388496398926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056566, "balance_loss_mlp": 1.04717207, "diversity_loss_mlp": 0.0, "epoch": 0.6937283570604078, "flos": 719396411904.0, "grad_norm": 0.07244313312376265, "language_loss": 0.87551069, "learning_rate": 0.00022647101533842845, "loss": 0.88607633, "num_input_tokens_seen": 299455216, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 3606, "time_per_iteration": 3.001912832260132 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058252, "balance_loss_mlp": 1.04882836, "diversity_loss_mlp": 0.0, "epoch": 0.6939207387456714, "flos": 522165574656.0, "grad_norm": 0.07498146805012186, "language_loss": 0.76334918, "learning_rate": 0.00022621027802778872, "loss": 0.77393162, "num_input_tokens_seen": 299524352, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 3607, "time_per_iteration": 2.6257400512695312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052774, "balance_loss_mlp": 1.04345798, "diversity_loss_mlp": 0.0, "epoch": 0.694113120430935, "flos": 535359767040.0, "grad_norm": 0.07029819881410336, "language_loss": 0.78756207, "learning_rate": 0.00022594964701174586, "loss": 0.79808986, "num_input_tokens_seen": 299594960, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3608, "time_per_iteration": 2.6099236011505127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065561, "balance_loss_mlp": 1.05642402, "diversity_loss_mlp": 0.0, "epoch": 0.6943055021161986, "flos": 523358972928.0, "grad_norm": 0.10152593614861574, "language_loss": 0.84643018, "learning_rate": 0.00022568912239148586, "loss": 0.85708582, "num_input_tokens_seen": 299662560, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 3609, "time_per_iteration": 2.6678829193115234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059207, "balance_loss_mlp": 1.04986095, "diversity_loss_mlp": 0.0, "epoch": 0.694497883801462, "flos": 484902982656.0, "grad_norm": 0.06906376751770449, "language_loss": 0.81638551, "learning_rate": 0.00022542870426815344, "loss": 0.82697761, "num_input_tokens_seen": 299734896, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 3610, "time_per_iteration": 2.69460129737854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058231, "balance_loss_mlp": 1.04869449, "diversity_loss_mlp": 0.0, "epoch": 0.6946902654867256, "flos": 461474786304.0, "grad_norm": 0.07528135941421366, "language_loss": 0.86051476, "learning_rate": 0.00022516839274285173, "loss": 0.87109709, "num_input_tokens_seen": 299799424, "router_z_loss_mlp": 0.09533691, "routerloss_mlp": 0.0, "step": 3611, "time_per_iteration": 2.5634658336639404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063678, "balance_loss_mlp": 1.05389714, "diversity_loss_mlp": 0.0, "epoch": 0.6948826471719892, "flos": 512855525376.0, "grad_norm": 0.06331906344074151, "language_loss": 0.7521888, "learning_rate": 0.00022490818791664265, "loss": 0.76282561, "num_input_tokens_seen": 299868272, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3612, "time_per_iteration": 2.617492437362671 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067849, "balance_loss_mlp": 1.05837226, "diversity_loss_mlp": 0.0, "epoch": 0.6950750288572528, "flos": 557184531456.0, "grad_norm": 0.05946591075452152, "language_loss": 0.85666263, "learning_rate": 0.00022464808989054676, "loss": 0.86734116, "num_input_tokens_seen": 299939136, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3613, "time_per_iteration": 2.6678874492645264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00789837, "balance_loss_mlp": 1.33770788, "diversity_loss_mlp": 0.21965824, "epoch": 0.6952674105425164, "flos": 542475740160.0, "grad_norm": 0.03604068217542595, "language_loss": 0.76138353, "learning_rate": 0.00022438809876554284, "loss": 0.76928186, "num_input_tokens_seen": 300009472, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01115366, "step": 3614, "time_per_iteration": 2.6613171100616455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070097, "balance_loss_mlp": 1.0602442, "diversity_loss_mlp": 0.0, "epoch": 0.6954597922277799, "flos": 546742752768.0, "grad_norm": 0.08971125257054285, "language_loss": 0.80425173, "learning_rate": 0.00022412821464256873, "loss": 0.81495273, "num_input_tokens_seen": 300081008, "router_z_loss_mlp": 0.09844971, "routerloss_mlp": 0.0, "step": 3615, "time_per_iteration": 2.7288718223571777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071192, "balance_loss_mlp": 1.06157804, "diversity_loss_mlp": 0.0, "epoch": 0.6956521739130435, "flos": 519511905792.0, "grad_norm": 0.07384702921709109, "language_loss": 0.82342923, "learning_rate": 0.00022386843762252023, "loss": 0.83414114, "num_input_tokens_seen": 300149856, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3616, "time_per_iteration": 2.5761711597442627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106737, "balance_loss_mlp": 1.0575707, "diversity_loss_mlp": 0.0, "epoch": 0.695844555598307, "flos": 466275543552.0, "grad_norm": 0.07908443617567998, "language_loss": 0.79798818, "learning_rate": 0.00022360876780625193, "loss": 0.80866194, "num_input_tokens_seen": 300217344, "router_z_loss_mlp": 0.09790039, "routerloss_mlp": 0.0, "step": 3617, "time_per_iteration": 2.6008386611938477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059868, "balance_loss_mlp": 1.05015886, "diversity_loss_mlp": 0.0, "epoch": 0.6960369372835706, "flos": 600663361536.0, "grad_norm": 0.07021226627677062, "language_loss": 0.80116498, "learning_rate": 0.00022334920529457604, "loss": 0.81176364, "num_input_tokens_seen": 300305584, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3618, "time_per_iteration": 2.9185733795166016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105864, "balance_loss_mlp": 1.04876924, "diversity_loss_mlp": 0.0, "epoch": 0.6962293189688342, "flos": 644233969152.0, "grad_norm": 0.05697997760775425, "language_loss": 0.87189567, "learning_rate": 0.00022308975018826423, "loss": 0.88248205, "num_input_tokens_seen": 300386480, "router_z_loss_mlp": 0.09863281, "routerloss_mlp": 0.0, "step": 3619, "time_per_iteration": 2.927544355392456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054275, "balance_loss_mlp": 1.04414856, "diversity_loss_mlp": 0.0, "epoch": 0.6964217006540977, "flos": 638810634240.0, "grad_norm": 0.0740354998090604, "language_loss": 0.84932256, "learning_rate": 0.00022283040258804564, "loss": 0.85986531, "num_input_tokens_seen": 300461840, "router_z_loss_mlp": 0.10125732, "routerloss_mlp": 0.0, "step": 3620, "time_per_iteration": 2.755613327026367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787662, "balance_loss_mlp": 1.33203387, "diversity_loss_mlp": 0.22018704, "epoch": 0.6966140823393613, "flos": 652167811584.0, "grad_norm": 0.033538632644234186, "language_loss": 0.83875167, "learning_rate": 0.00022257116259460802, "loss": 0.84662825, "num_input_tokens_seen": 300540400, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01155162, "step": 3621, "time_per_iteration": 2.844062089920044 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047866, "balance_loss_mlp": 1.03843641, "diversity_loss_mlp": 0.0, "epoch": 0.6968064640246249, "flos": 704492328960.0, "grad_norm": 0.06349986715080715, "language_loss": 0.81602001, "learning_rate": 0.00022231203030859725, "loss": 0.82649869, "num_input_tokens_seen": 300624240, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3622, "time_per_iteration": 2.9582505226135254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053512, "balance_loss_mlp": 1.04382682, "diversity_loss_mlp": 0.0, "epoch": 0.6969988457098885, "flos": 492555271680.0, "grad_norm": 0.09473470519326596, "language_loss": 0.83760095, "learning_rate": 0.00022205300583061737, "loss": 0.84813607, "num_input_tokens_seen": 300689728, "router_z_loss_mlp": 0.09674072, "routerloss_mlp": 0.0, "step": 3623, "time_per_iteration": 2.5727412700653076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01016252, "balance_loss_mlp": 1.01057744, "diversity_loss_mlp": 0.0, "epoch": 0.6971912273951519, "flos": 1352592442368.0, "grad_norm": 0.01746847385777515, "language_loss": 0.82838202, "learning_rate": 0.00022179408926123063, "loss": 0.83854461, "num_input_tokens_seen": 300913152, "router_z_loss_mlp": 0.05664062, "routerloss_mlp": 0.0, "step": 3624, "time_per_iteration": 4.8940582275390625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051503, "balance_loss_mlp": 1.04190028, "diversity_loss_mlp": 0.0, "epoch": 0.6973836090804155, "flos": 602459887104.0, "grad_norm": 0.07214179790538137, "language_loss": 0.77598304, "learning_rate": 0.00022153528070095735, "loss": 0.78649807, "num_input_tokens_seen": 300985824, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3625, "time_per_iteration": 2.694251298904419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049268, "balance_loss_mlp": 1.03960037, "diversity_loss_mlp": 0.0, "epoch": 0.6975759907656791, "flos": 524065614336.0, "grad_norm": 0.07542787145084529, "language_loss": 0.88381326, "learning_rate": 0.00022127658025027568, "loss": 0.89430594, "num_input_tokens_seen": 301058048, "router_z_loss_mlp": 0.09655762, "routerloss_mlp": 0.0, "step": 3626, "time_per_iteration": 2.6595661640167236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053933, "balance_loss_mlp": 1.04412818, "diversity_loss_mlp": 0.0, "epoch": 0.6977683724509427, "flos": 480912754176.0, "grad_norm": 0.08038583191357998, "language_loss": 0.85689813, "learning_rate": 0.00022101798800962258, "loss": 0.86743748, "num_input_tokens_seen": 301127472, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3627, "time_per_iteration": 2.6137661933898926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057745, "balance_loss_mlp": 1.04847646, "diversity_loss_mlp": 0.0, "epoch": 0.6979607541362063, "flos": 522625167360.0, "grad_norm": 0.08075391789271535, "language_loss": 0.78634858, "learning_rate": 0.00022075950407939227, "loss": 0.79692602, "num_input_tokens_seen": 301193920, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 3628, "time_per_iteration": 2.6296188831329346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059141, "balance_loss_mlp": 1.04959214, "diversity_loss_mlp": 0.0, "epoch": 0.6981531358214698, "flos": 548077114368.0, "grad_norm": 0.0897351301563825, "language_loss": 0.8281461, "learning_rate": 0.0002205011285599367, "loss": 0.83873749, "num_input_tokens_seen": 301264256, "router_z_loss_mlp": 0.09539795, "routerloss_mlp": 0.0, "step": 3629, "time_per_iteration": 2.6147000789642334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0079513, "balance_loss_mlp": 1.34714937, "diversity_loss_mlp": 0.21970588, "epoch": 0.6983455175067333, "flos": 700052419584.0, "grad_norm": 0.029792453728032804, "language_loss": 0.80962801, "learning_rate": 0.00022024286155156658, "loss": 0.81757927, "num_input_tokens_seen": 301337696, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01170244, "step": 3630, "time_per_iteration": 2.8613815307617188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058875, "balance_loss_mlp": 1.04967785, "diversity_loss_mlp": 0.0, "epoch": 0.6985378991919969, "flos": 485078450688.0, "grad_norm": 0.10033041150535157, "language_loss": 0.86079919, "learning_rate": 0.00021998470315454994, "loss": 0.87138796, "num_input_tokens_seen": 301407776, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 3631, "time_per_iteration": 2.647185802459717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061381, "balance_loss_mlp": 1.05195761, "diversity_loss_mlp": 0.0, "epoch": 0.6987302808772605, "flos": 558780622848.0, "grad_norm": 0.06594571513985185, "language_loss": 0.86829215, "learning_rate": 0.00021972665346911275, "loss": 0.87890601, "num_input_tokens_seen": 301475120, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 3632, "time_per_iteration": 2.757704257965088 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065424, "balance_loss_mlp": 1.05622673, "diversity_loss_mlp": 0.0, "epoch": 0.698922662562524, "flos": 483593587200.0, "grad_norm": 0.06824207534465764, "language_loss": 0.79957312, "learning_rate": 0.00021946871259543877, "loss": 0.81022739, "num_input_tokens_seen": 301542416, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 3633, "time_per_iteration": 2.577909231185913 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063518, "balance_loss_mlp": 1.05467892, "diversity_loss_mlp": 0.0, "epoch": 0.6991150442477876, "flos": 718909655040.0, "grad_norm": 0.08329780404335202, "language_loss": 0.83364546, "learning_rate": 0.00021921088063366957, "loss": 0.84428072, "num_input_tokens_seen": 301620672, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 3634, "time_per_iteration": 2.933506965637207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106547, "balance_loss_mlp": 1.05625534, "diversity_loss_mlp": 0.0, "epoch": 0.6993074259330512, "flos": 489128150016.0, "grad_norm": 0.06097911291290099, "language_loss": 0.81932688, "learning_rate": 0.00021895315768390435, "loss": 0.82998157, "num_input_tokens_seen": 301688016, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 3635, "time_per_iteration": 2.6155378818511963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071151, "balance_loss_mlp": 1.06179357, "diversity_loss_mlp": 0.0, "epoch": 0.6994998076183148, "flos": 718089214464.0, "grad_norm": 0.05851098027896569, "language_loss": 0.87547219, "learning_rate": 0.00021869554384619999, "loss": 0.88618374, "num_input_tokens_seen": 301771184, "router_z_loss_mlp": 0.09350586, "routerloss_mlp": 0.0, "step": 3636, "time_per_iteration": 2.9845876693725586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106866, "balance_loss_mlp": 1.05937409, "diversity_loss_mlp": 0.0, "epoch": 0.6996921893035783, "flos": 579016636416.0, "grad_norm": 0.066101183722826, "language_loss": 0.80819213, "learning_rate": 0.00021843803922057115, "loss": 0.81887871, "num_input_tokens_seen": 301844528, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 3637, "time_per_iteration": 2.736743688583374 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069376, "balance_loss_mlp": 1.060215, "diversity_loss_mlp": 0.0, "epoch": 0.6998845709888418, "flos": 518629796352.0, "grad_norm": 0.07934438223674636, "language_loss": 0.8197611, "learning_rate": 0.00021818064390698977, "loss": 0.83045483, "num_input_tokens_seen": 301914960, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 3638, "time_per_iteration": 2.6075611114501953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070673, "balance_loss_mlp": 1.06178594, "diversity_loss_mlp": 0.0, "epoch": 0.7000769526741054, "flos": 620951505408.0, "grad_norm": 0.0705113992952529, "language_loss": 0.87237096, "learning_rate": 0.0002179233580053861, "loss": 0.88307768, "num_input_tokens_seen": 301986352, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 3639, "time_per_iteration": 2.7142910957336426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107015, "balance_loss_mlp": 1.06120896, "diversity_loss_mlp": 0.0, "epoch": 0.700269334359369, "flos": 559946856960.0, "grad_norm": 0.07560028355572443, "language_loss": 0.85636085, "learning_rate": 0.00021766618161564688, "loss": 0.86706233, "num_input_tokens_seen": 302060544, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 3640, "time_per_iteration": 2.7285115718841553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065402, "balance_loss_mlp": 1.0562886, "diversity_loss_mlp": 0.0, "epoch": 0.7004617160446326, "flos": 483343967232.0, "grad_norm": 0.06395770762467583, "language_loss": 0.87343419, "learning_rate": 0.00021740911483761677, "loss": 0.88408822, "num_input_tokens_seen": 302127232, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3641, "time_per_iteration": 2.584667205810547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068314, "balance_loss_mlp": 1.05936706, "diversity_loss_mlp": 0.0, "epoch": 0.7006540977298961, "flos": 696981003264.0, "grad_norm": 0.05940351360925286, "language_loss": 0.91777283, "learning_rate": 0.00021715215777109837, "loss": 0.92845595, "num_input_tokens_seen": 302207056, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 3642, "time_per_iteration": 2.9933156967163086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069259, "balance_loss_mlp": 1.06025815, "diversity_loss_mlp": 0.0, "epoch": 0.7008464794151597, "flos": 504775950336.0, "grad_norm": 0.07347565488383569, "language_loss": 0.84518594, "learning_rate": 0.00021689531051585103, "loss": 0.85587853, "num_input_tokens_seen": 302275632, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 3643, "time_per_iteration": 2.6531710624694824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067704, "balance_loss_mlp": 1.05844164, "diversity_loss_mlp": 0.0, "epoch": 0.7010388611004232, "flos": 537242554368.0, "grad_norm": 0.08696231717445767, "language_loss": 0.80713868, "learning_rate": 0.00021663857317159196, "loss": 0.81781578, "num_input_tokens_seen": 302343600, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 3644, "time_per_iteration": 2.604703426361084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072761, "balance_loss_mlp": 1.06396961, "diversity_loss_mlp": 0.0, "epoch": 0.7012312427856868, "flos": 547259245056.0, "grad_norm": 0.057193672258815845, "language_loss": 0.81973934, "learning_rate": 0.00021638194583799487, "loss": 0.83046699, "num_input_tokens_seen": 302414656, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 3645, "time_per_iteration": 2.6747145652770996 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067445, "balance_loss_mlp": 1.05851054, "diversity_loss_mlp": 0.0, "epoch": 0.7014236244709504, "flos": 941409630720.0, "grad_norm": 0.08498226844175927, "language_loss": 0.82551372, "learning_rate": 0.00021612542861469176, "loss": 0.83618826, "num_input_tokens_seen": 302495120, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 3646, "time_per_iteration": 3.2375802993774414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067002, "balance_loss_mlp": 1.05810285, "diversity_loss_mlp": 0.0, "epoch": 0.7016160061562139, "flos": 525167608320.0, "grad_norm": 0.07003978186883456, "language_loss": 0.8260622, "learning_rate": 0.00021586902160127135, "loss": 0.83673215, "num_input_tokens_seen": 302563024, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 3647, "time_per_iteration": 2.6448206901550293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076859, "balance_loss_mlp": 1.06791854, "diversity_loss_mlp": 0.0, "epoch": 0.7018083878414775, "flos": 373385023488.0, "grad_norm": 0.11788208419913924, "language_loss": 0.74163634, "learning_rate": 0.00021561272489727974, "loss": 0.75240493, "num_input_tokens_seen": 302624544, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 3648, "time_per_iteration": 2.5040485858917236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107998, "balance_loss_mlp": 1.07128358, "diversity_loss_mlp": 0.0, "epoch": 0.7020007695267411, "flos": 527784201216.0, "grad_norm": 0.06337788759133205, "language_loss": 0.8008945, "learning_rate": 0.0002153565386022199, "loss": 0.81169432, "num_input_tokens_seen": 302697856, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 3649, "time_per_iteration": 2.7248024940490723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076924, "balance_loss_mlp": 1.06812, "diversity_loss_mlp": 0.0, "epoch": 0.7021931512120047, "flos": 690154297344.0, "grad_norm": 0.0801860998557123, "language_loss": 0.82855487, "learning_rate": 0.00021510046281555262, "loss": 0.83932412, "num_input_tokens_seen": 302771984, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 3650, "time_per_iteration": 2.809051036834717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077447, "balance_loss_mlp": 1.06870925, "diversity_loss_mlp": 0.0, "epoch": 0.7023855328972681, "flos": 639784147968.0, "grad_norm": 0.08542793543919469, "language_loss": 0.81736684, "learning_rate": 0.0002148444976366949, "loss": 0.82814133, "num_input_tokens_seen": 302838832, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 3651, "time_per_iteration": 2.7492573261260986 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01084402, "balance_loss_mlp": 1.07583714, "diversity_loss_mlp": 0.0, "epoch": 0.7025779145825317, "flos": 560940194304.0, "grad_norm": 0.0799718694707253, "language_loss": 0.82820916, "learning_rate": 0.00021458864316502136, "loss": 0.83905321, "num_input_tokens_seen": 302909952, "router_z_loss_mlp": 0.08575439, "routerloss_mlp": 0.0, "step": 3652, "time_per_iteration": 2.7140626907348633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082394, "balance_loss_mlp": 1.07368028, "diversity_loss_mlp": 0.0, "epoch": 0.7027702962677953, "flos": 447445472256.0, "grad_norm": 0.0716785593922181, "language_loss": 0.87417138, "learning_rate": 0.0002143328994998634, "loss": 0.88499534, "num_input_tokens_seen": 302973056, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 3653, "time_per_iteration": 2.5076870918273926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074305, "balance_loss_mlp": 1.06541252, "diversity_loss_mlp": 0.0, "epoch": 0.7029626779530589, "flos": 622500609024.0, "grad_norm": 0.078552736129926, "language_loss": 0.78368807, "learning_rate": 0.00021407726674050982, "loss": 0.79443109, "num_input_tokens_seen": 303054656, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 3654, "time_per_iteration": 2.8595826625823975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077013, "balance_loss_mlp": 1.06806064, "diversity_loss_mlp": 0.0, "epoch": 0.7031550596383225, "flos": 629591989248.0, "grad_norm": 0.06456326920806615, "language_loss": 0.8704083, "learning_rate": 0.0002138217449862061, "loss": 0.88117838, "num_input_tokens_seen": 303124256, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 3655, "time_per_iteration": 2.727473258972168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074047, "balance_loss_mlp": 1.06530333, "diversity_loss_mlp": 0.0, "epoch": 0.703347441323586, "flos": 530843134464.0, "grad_norm": 0.06685907167482581, "language_loss": 0.78296137, "learning_rate": 0.00021356633433615403, "loss": 0.79370177, "num_input_tokens_seen": 303192720, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 3656, "time_per_iteration": 2.5853357315063477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072471, "balance_loss_mlp": 1.06341755, "diversity_loss_mlp": 0.0, "epoch": 0.7035398230088495, "flos": 693593528832.0, "grad_norm": 0.05195711031116695, "language_loss": 0.83568424, "learning_rate": 0.0002133110348895133, "loss": 0.84640896, "num_input_tokens_seen": 303275968, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 3657, "time_per_iteration": 2.966989517211914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069658, "balance_loss_mlp": 1.06044364, "diversity_loss_mlp": 0.0, "epoch": 0.7037322046941131, "flos": 968035152384.0, "grad_norm": 0.05842315057280589, "language_loss": 0.85166538, "learning_rate": 0.0002130558467453999, "loss": 0.86236197, "num_input_tokens_seen": 303367296, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 3658, "time_per_iteration": 3.3303468227386475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01080025, "balance_loss_mlp": 1.07069683, "diversity_loss_mlp": 0.0, "epoch": 0.7039245863793767, "flos": 502863427584.0, "grad_norm": 0.06729984707772495, "language_loss": 0.8469972, "learning_rate": 0.0002128007700028865, "loss": 0.85779744, "num_input_tokens_seen": 303442768, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 3659, "time_per_iteration": 2.7004916667938232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069483, "balance_loss_mlp": 1.06041121, "diversity_loss_mlp": 0.0, "epoch": 0.7041169680646402, "flos": 465954342912.0, "grad_norm": 0.08608403684795747, "language_loss": 0.84587854, "learning_rate": 0.00021254580476100276, "loss": 0.85657346, "num_input_tokens_seen": 303508304, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3660, "time_per_iteration": 2.5480196475982666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072222, "balance_loss_mlp": 1.06278646, "diversity_loss_mlp": 0.0, "epoch": 0.7043093497499038, "flos": 632181417984.0, "grad_norm": 0.07339918095130941, "language_loss": 0.79315257, "learning_rate": 0.00021229095111873497, "loss": 0.80387473, "num_input_tokens_seen": 303579312, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3661, "time_per_iteration": 2.7757935523986816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791151, "balance_loss_mlp": 1.34026599, "diversity_loss_mlp": 0.21938899, "epoch": 0.7045017314351674, "flos": 542930190336.0, "grad_norm": 0.027590424390171175, "language_loss": 0.85883224, "learning_rate": 0.0002120362091750261, "loss": 0.8667438, "num_input_tokens_seen": 303658384, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01132388, "step": 3662, "time_per_iteration": 2.896202802658081 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00798199, "balance_loss_mlp": 1.35343075, "diversity_loss_mlp": 0.22044487, "epoch": 0.704694113120431, "flos": 428237300736.0, "grad_norm": 0.03684811642709949, "language_loss": 0.87121612, "learning_rate": 0.00021178157902877566, "loss": 0.87919807, "num_input_tokens_seen": 303721136, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01126087, "step": 3663, "time_per_iteration": 2.4897618293762207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059718, "balance_loss_mlp": 1.05026472, "diversity_loss_mlp": 0.0, "epoch": 0.7048864948056945, "flos": 650544556032.0, "grad_norm": 0.06585144557964606, "language_loss": 0.868586, "learning_rate": 0.0002115270607788397, "loss": 0.87918323, "num_input_tokens_seen": 303792368, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3664, "time_per_iteration": 2.767237901687622 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061738, "balance_loss_mlp": 1.05233264, "diversity_loss_mlp": 0.0, "epoch": 0.705078876490958, "flos": 412562336256.0, "grad_norm": 0.06809628156665722, "language_loss": 0.8563199, "learning_rate": 0.00021127265452403133, "loss": 0.86693728, "num_input_tokens_seen": 303856336, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3665, "time_per_iteration": 2.5270590782165527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01028622, "balance_loss_mlp": 1.02266109, "diversity_loss_mlp": 0.0, "epoch": 0.7052712581762216, "flos": 1420040927232.0, "grad_norm": 0.030216242564882093, "language_loss": 0.84091628, "learning_rate": 0.0002110183603631199, "loss": 0.85120249, "num_input_tokens_seen": 304089856, "router_z_loss_mlp": 0.05957031, "routerloss_mlp": 0.0, "step": 3666, "time_per_iteration": 4.850507974624634 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105764, "balance_loss_mlp": 1.04785872, "diversity_loss_mlp": 0.0, "epoch": 0.7054636398614852, "flos": 493049369088.0, "grad_norm": 0.07688296901308685, "language_loss": 0.82549417, "learning_rate": 0.00021076417839483065, "loss": 0.83607054, "num_input_tokens_seen": 304164752, "router_z_loss_mlp": 0.09777832, "routerloss_mlp": 0.0, "step": 3667, "time_per_iteration": 2.789318799972534 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00785288, "balance_loss_mlp": 1.32734215, "diversity_loss_mlp": 0.21942863, "epoch": 0.7056560215467488, "flos": 450457417728.0, "grad_norm": 0.027872662040783723, "language_loss": 0.85229611, "learning_rate": 0.00021051010871784589, "loss": 0.86014903, "num_input_tokens_seen": 304229568, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01190263, "step": 3668, "time_per_iteration": 2.6029293537139893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049431, "balance_loss_mlp": 1.03972173, "diversity_loss_mlp": 0.0, "epoch": 0.7058484032320124, "flos": 565703875584.0, "grad_norm": 0.06094440535163373, "language_loss": 0.79136097, "learning_rate": 0.0002102561514308045, "loss": 0.80185533, "num_input_tokens_seen": 304299408, "router_z_loss_mlp": 0.09698486, "routerloss_mlp": 0.0, "step": 3669, "time_per_iteration": 2.717550754547119 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048509, "balance_loss_mlp": 1.03882289, "diversity_loss_mlp": 0.0, "epoch": 0.7060407849172758, "flos": 567008501760.0, "grad_norm": 0.06685679205809081, "language_loss": 0.82684934, "learning_rate": 0.00021000230663230135, "loss": 0.83733451, "num_input_tokens_seen": 304367936, "router_z_loss_mlp": 0.09680176, "routerloss_mlp": 0.0, "step": 3670, "time_per_iteration": 2.663641929626465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047272, "balance_loss_mlp": 1.03758621, "diversity_loss_mlp": 0.0, "epoch": 0.7062331666025394, "flos": 468746403840.0, "grad_norm": 0.0788999580683501, "language_loss": 0.8333686, "learning_rate": 0.00020974857442088762, "loss": 0.84384131, "num_input_tokens_seen": 304438368, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 3671, "time_per_iteration": 2.603200674057007 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050365, "balance_loss_mlp": 1.04090595, "diversity_loss_mlp": 0.0, "epoch": 0.706425548287803, "flos": 595316749824.0, "grad_norm": 0.06597055707746856, "language_loss": 0.89200228, "learning_rate": 0.00020949495489507104, "loss": 0.90250599, "num_input_tokens_seen": 304508720, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3672, "time_per_iteration": 2.6877996921539307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052217, "balance_loss_mlp": 1.04270363, "diversity_loss_mlp": 0.0, "epoch": 0.7066179299730666, "flos": 475815389184.0, "grad_norm": 0.17274894008002345, "language_loss": 0.84991109, "learning_rate": 0.00020924144815331525, "loss": 0.86043334, "num_input_tokens_seen": 304576128, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3673, "time_per_iteration": 2.5844242572784424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054803, "balance_loss_mlp": 1.04517114, "diversity_loss_mlp": 0.0, "epoch": 0.7068103116583301, "flos": 506409117696.0, "grad_norm": 0.0640379080300773, "language_loss": 0.83600396, "learning_rate": 0.00020898805429404044, "loss": 0.84655201, "num_input_tokens_seen": 304642416, "router_z_loss_mlp": 0.09625244, "routerloss_mlp": 0.0, "step": 3674, "time_per_iteration": 2.676417350769043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056838, "balance_loss_mlp": 1.04724169, "diversity_loss_mlp": 0.0, "epoch": 0.7070026933435937, "flos": 679336989696.0, "grad_norm": 0.0780577693768427, "language_loss": 0.78793156, "learning_rate": 0.0002087347734156228, "loss": 0.79849994, "num_input_tokens_seen": 304719312, "router_z_loss_mlp": 0.09594727, "routerloss_mlp": 0.0, "step": 3675, "time_per_iteration": 2.8697783946990967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057522, "balance_loss_mlp": 1.04800272, "diversity_loss_mlp": 0.0, "epoch": 0.7071950750288573, "flos": 472217942016.0, "grad_norm": 0.0710988084964876, "language_loss": 0.79834986, "learning_rate": 0.00020848160561639452, "loss": 0.80892509, "num_input_tokens_seen": 304789296, "router_z_loss_mlp": 0.09515381, "routerloss_mlp": 0.0, "step": 3676, "time_per_iteration": 2.7413785457611084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106069, "balance_loss_mlp": 1.05147529, "diversity_loss_mlp": 0.0, "epoch": 0.7073874567141208, "flos": 473742452736.0, "grad_norm": 0.06834186778178446, "language_loss": 0.86040401, "learning_rate": 0.0002082285509946445, "loss": 0.8710109, "num_input_tokens_seen": 304854320, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 3677, "time_per_iteration": 2.5471127033233643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063838, "balance_loss_mlp": 1.05436051, "diversity_loss_mlp": 0.0, "epoch": 0.7075798383993844, "flos": 545877895680.0, "grad_norm": 0.06236421972787801, "language_loss": 0.83409554, "learning_rate": 0.00020797560964861683, "loss": 0.84473389, "num_input_tokens_seen": 304932784, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3678, "time_per_iteration": 2.748696804046631 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065573, "balance_loss_mlp": 1.05635202, "diversity_loss_mlp": 0.0, "epoch": 0.7077722200846479, "flos": 662090526720.0, "grad_norm": 0.07878907365407993, "language_loss": 0.80641901, "learning_rate": 0.0002077227816765122, "loss": 0.81707478, "num_input_tokens_seen": 305018080, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 3679, "time_per_iteration": 3.000666618347168 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01036266, "balance_loss_mlp": 1.03114033, "diversity_loss_mlp": 0.0, "epoch": 0.7079646017699115, "flos": 1529960223744.0, "grad_norm": 0.025842314854182848, "language_loss": 0.76447725, "learning_rate": 0.0002074700671764869, "loss": 0.77483988, "num_input_tokens_seen": 305241216, "router_z_loss_mlp": 0.05126953, "routerloss_mlp": 0.0, "step": 3680, "time_per_iteration": 4.779016971588135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106697, "balance_loss_mlp": 1.05772507, "diversity_loss_mlp": 0.0, "epoch": 0.7081569834551751, "flos": 621502502400.0, "grad_norm": 0.06703239561102693, "language_loss": 0.78754878, "learning_rate": 0.00020721746624665383, "loss": 0.79821849, "num_input_tokens_seen": 305311376, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3681, "time_per_iteration": 2.7041916847229004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073317, "balance_loss_mlp": 1.06381631, "diversity_loss_mlp": 0.0, "epoch": 0.7083493651404387, "flos": 794630435328.0, "grad_norm": 0.06071055961479113, "language_loss": 0.80160034, "learning_rate": 0.00020696497898508114, "loss": 0.81233358, "num_input_tokens_seen": 305392736, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3682, "time_per_iteration": 3.003126382827759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073203, "balance_loss_mlp": 1.06374955, "diversity_loss_mlp": 0.0, "epoch": 0.7085417468257021, "flos": 813747202560.0, "grad_norm": 0.0794178936209596, "language_loss": 0.77425051, "learning_rate": 0.00020671260548979316, "loss": 0.7849825, "num_input_tokens_seen": 305470896, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3683, "time_per_iteration": 3.000619649887085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01079652, "balance_loss_mlp": 1.07019854, "diversity_loss_mlp": 0.0, "epoch": 0.7087341285109657, "flos": 700566340608.0, "grad_norm": 0.06569012319146904, "language_loss": 0.85012448, "learning_rate": 0.00020646034585876982, "loss": 0.86092097, "num_input_tokens_seen": 305547072, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3684, "time_per_iteration": 2.8407599925994873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788495, "balance_loss_mlp": 1.33244729, "diversity_loss_mlp": 0.22155851, "epoch": 0.7089265101962293, "flos": 596514917376.0, "grad_norm": 0.02817752508262258, "language_loss": 0.84630954, "learning_rate": 0.00020620820018994718, "loss": 0.8541944, "num_input_tokens_seen": 305624512, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0114923, "step": 3685, "time_per_iteration": 2.8807289600372314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791818, "balance_loss_mlp": 1.33957911, "diversity_loss_mlp": 0.22135019, "epoch": 0.7091188918814929, "flos": 487106970624.0, "grad_norm": 0.03572846620936607, "language_loss": 0.83307725, "learning_rate": 0.00020595616858121675, "loss": 0.84099543, "num_input_tokens_seen": 305695088, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0113536, "step": 3686, "time_per_iteration": 2.7336056232452393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01075035, "balance_loss_mlp": 1.06569517, "diversity_loss_mlp": 0.0, "epoch": 0.7093112735667565, "flos": 600117507072.0, "grad_norm": 0.05825520117041851, "language_loss": 0.80985916, "learning_rate": 0.00020570425113042586, "loss": 0.82060945, "num_input_tokens_seen": 305763680, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 3687, "time_per_iteration": 2.724151611328125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01078198, "balance_loss_mlp": 1.06894779, "diversity_loss_mlp": 0.0, "epoch": 0.70950365525202, "flos": 505830956544.0, "grad_norm": 0.0736963808397267, "language_loss": 0.8558749, "learning_rate": 0.0002054524479353776, "loss": 0.8666569, "num_input_tokens_seen": 305835008, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3688, "time_per_iteration": 2.7505970001220703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074288, "balance_loss_mlp": 1.06498957, "diversity_loss_mlp": 0.0, "epoch": 0.7096960369372836, "flos": 732160747008.0, "grad_norm": 0.07506666957013575, "language_loss": 0.81571054, "learning_rate": 0.00020520075909383063, "loss": 0.82645345, "num_input_tokens_seen": 305909072, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 3689, "time_per_iteration": 2.854198694229126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074645, "balance_loss_mlp": 1.06511474, "diversity_loss_mlp": 0.0, "epoch": 0.7098884186225471, "flos": 972077511168.0, "grad_norm": 0.06551416788386397, "language_loss": 0.80860078, "learning_rate": 0.00020494918470349916, "loss": 0.81934714, "num_input_tokens_seen": 305994752, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3690, "time_per_iteration": 3.2713325023651123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0079528, "balance_loss_mlp": 1.34716058, "diversity_loss_mlp": 0.22097552, "epoch": 0.7100808003078107, "flos": 504252117504.0, "grad_norm": 0.03587666052644611, "language_loss": 0.85333264, "learning_rate": 0.00020469772486205297, "loss": 0.86128545, "num_input_tokens_seen": 306062960, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01121199, "step": 3691, "time_per_iteration": 2.626685380935669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787595, "balance_loss_mlp": 1.33183146, "diversity_loss_mlp": 0.22060202, "epoch": 0.7102731819930742, "flos": 540335992320.0, "grad_norm": 0.030476334667887343, "language_loss": 0.81455922, "learning_rate": 0.0002044463796671177, "loss": 0.82243514, "num_input_tokens_seen": 306134224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0113784, "step": 3692, "time_per_iteration": 2.7819416522979736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074018, "balance_loss_mlp": 1.06465387, "diversity_loss_mlp": 0.0, "epoch": 0.7104655636783378, "flos": 620378113536.0, "grad_norm": 0.07963770038273417, "language_loss": 0.8046093, "learning_rate": 0.00020419514921627408, "loss": 0.81534946, "num_input_tokens_seen": 306214512, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3693, "time_per_iteration": 2.8676981925964355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069877, "balance_loss_mlp": 1.06088233, "diversity_loss_mlp": 0.0, "epoch": 0.7106579453636014, "flos": 557322923520.0, "grad_norm": 0.07391756130926609, "language_loss": 0.77261078, "learning_rate": 0.00020394403360705855, "loss": 0.78330958, "num_input_tokens_seen": 306283232, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 3694, "time_per_iteration": 2.695068359375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788663, "balance_loss_mlp": 1.33321095, "diversity_loss_mlp": 0.22100018, "epoch": 0.710850327048865, "flos": 513048245760.0, "grad_norm": 0.034812211167962216, "language_loss": 0.88271379, "learning_rate": 0.00020369303293696228, "loss": 0.89060044, "num_input_tokens_seen": 306351536, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01155703, "step": 3695, "time_per_iteration": 2.601621627807617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066517, "balance_loss_mlp": 1.05723643, "diversity_loss_mlp": 0.0, "epoch": 0.7110427087341286, "flos": 423619352064.0, "grad_norm": 0.07715335648803619, "language_loss": 0.78224587, "learning_rate": 0.00020344214730343304, "loss": 0.79291105, "num_input_tokens_seen": 306419040, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 3696, "time_per_iteration": 2.6193599700927734 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065299, "balance_loss_mlp": 1.05618572, "diversity_loss_mlp": 0.0, "epoch": 0.711235090419392, "flos": 577415402496.0, "grad_norm": 0.05468894944159508, "language_loss": 0.79277122, "learning_rate": 0.00020319137680387296, "loss": 0.80342424, "num_input_tokens_seen": 306503248, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 3697, "time_per_iteration": 2.9309933185577393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060045, "balance_loss_mlp": 1.05068743, "diversity_loss_mlp": 0.0, "epoch": 0.7114274721046556, "flos": 448060709376.0, "grad_norm": 0.07057759031394817, "language_loss": 0.80451727, "learning_rate": 0.0002029407215356398, "loss": 0.81511772, "num_input_tokens_seen": 306566288, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 3698, "time_per_iteration": 2.4956727027893066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058583, "balance_loss_mlp": 1.04976714, "diversity_loss_mlp": 0.0, "epoch": 0.7116198537899192, "flos": 621962095104.0, "grad_norm": 0.0722387573875999, "language_loss": 0.83844793, "learning_rate": 0.00020269018159604663, "loss": 0.84903371, "num_input_tokens_seen": 306633344, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 3699, "time_per_iteration": 2.731231689453125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057429, "balance_loss_mlp": 1.04814827, "diversity_loss_mlp": 0.0, "epoch": 0.7118122354751828, "flos": 498724895232.0, "grad_norm": 0.07123396580800914, "language_loss": 0.818003, "learning_rate": 0.00020243975708236162, "loss": 0.82857728, "num_input_tokens_seen": 306701328, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 3700, "time_per_iteration": 2.597215414047241 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00781944, "balance_loss_mlp": 1.31673443, "diversity_loss_mlp": 0.22274226, "epoch": 0.7120046171604463, "flos": 572718532608.0, "grad_norm": 0.030217464674653638, "language_loss": 0.86634398, "learning_rate": 0.00020218944809180818, "loss": 0.87416339, "num_input_tokens_seen": 306773168, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01220552, "step": 3701, "time_per_iteration": 2.7128944396972656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056546, "balance_loss_mlp": 1.04739642, "diversity_loss_mlp": 0.0, "epoch": 0.7121969988457099, "flos": 572664204288.0, "grad_norm": 0.06969302254489844, "language_loss": 0.84630072, "learning_rate": 0.00020193925472156493, "loss": 0.85686618, "num_input_tokens_seen": 306845312, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3702, "time_per_iteration": 2.695040702819824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01009738, "balance_loss_mlp": 1.00442076, "diversity_loss_mlp": 0.0, "epoch": 0.7123893805309734, "flos": 1523429752320.0, "grad_norm": 0.015177951683804305, "language_loss": 0.74289167, "learning_rate": 0.00020168917706876537, "loss": 0.75298905, "num_input_tokens_seen": 307079216, "router_z_loss_mlp": 0.05322266, "routerloss_mlp": 0.0, "step": 3703, "time_per_iteration": 4.91239857673645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784779, "balance_loss_mlp": 1.3239193, "diversity_loss_mlp": 0.22157452, "epoch": 0.712581762216237, "flos": 615105280512.0, "grad_norm": 0.02622509859947044, "language_loss": 0.83696187, "learning_rate": 0.00020143921523049863, "loss": 0.84480959, "num_input_tokens_seen": 307163568, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01203172, "step": 3704, "time_per_iteration": 3.0262062549591064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057244, "balance_loss_mlp": 1.04805851, "diversity_loss_mlp": 0.0, "epoch": 0.7127741439015006, "flos": 597777698304.0, "grad_norm": 0.07737525798134272, "language_loss": 0.838422, "learning_rate": 0.00020118936930380837, "loss": 0.84899437, "num_input_tokens_seen": 307232800, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 3705, "time_per_iteration": 2.741217851638794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105596, "balance_loss_mlp": 1.04639971, "diversity_loss_mlp": 0.0, "epoch": 0.7129665255867641, "flos": 537398198784.0, "grad_norm": 0.08146435226617602, "language_loss": 0.80879092, "learning_rate": 0.0002009396393856932, "loss": 0.81935048, "num_input_tokens_seen": 307307216, "router_z_loss_mlp": 0.09552002, "routerloss_mlp": 0.0, "step": 3706, "time_per_iteration": 2.643540143966675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050376, "balance_loss_mlp": 1.0414114, "diversity_loss_mlp": 0.0, "epoch": 0.7131589072720277, "flos": 526442499072.0, "grad_norm": 0.07418360122955521, "language_loss": 0.82790005, "learning_rate": 0.00020069002557310673, "loss": 0.83840382, "num_input_tokens_seen": 307377472, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 3707, "time_per_iteration": 2.719648838043213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052452, "balance_loss_mlp": 1.04351699, "diversity_loss_mlp": 0.0, "epoch": 0.7133512889572913, "flos": 530919484416.0, "grad_norm": 0.05884856391484217, "language_loss": 0.77115107, "learning_rate": 0.00020044052796295807, "loss": 0.78167558, "num_input_tokens_seen": 307456880, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 3708, "time_per_iteration": 2.830353260040283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051246, "balance_loss_mlp": 1.04202533, "diversity_loss_mlp": 0.0, "epoch": 0.7135436706425549, "flos": 503535564288.0, "grad_norm": 0.07889939453961878, "language_loss": 0.82217181, "learning_rate": 0.00020019114665211063, "loss": 0.83268428, "num_input_tokens_seen": 307524784, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 3709, "time_per_iteration": 2.581709623336792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048706, "balance_loss_mlp": 1.03982449, "diversity_loss_mlp": 0.0, "epoch": 0.7137360523278183, "flos": 515968786944.0, "grad_norm": 0.06519405348344502, "language_loss": 0.81405282, "learning_rate": 0.00019994188173738276, "loss": 0.8245399, "num_input_tokens_seen": 307591408, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 3710, "time_per_iteration": 2.5735976696014404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049854, "balance_loss_mlp": 1.04063272, "diversity_loss_mlp": 0.0, "epoch": 0.7139284340130819, "flos": 510389434368.0, "grad_norm": 0.07046885330875076, "language_loss": 0.80712581, "learning_rate": 0.0001996927333155477, "loss": 0.81762433, "num_input_tokens_seen": 307662912, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 3711, "time_per_iteration": 2.814368724822998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054586, "balance_loss_mlp": 1.04546654, "diversity_loss_mlp": 0.0, "epoch": 0.7141208156983455, "flos": 890275940352.0, "grad_norm": 0.07187972004168419, "language_loss": 0.85349059, "learning_rate": 0.00019944370148333346, "loss": 0.8640365, "num_input_tokens_seen": 307752256, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 3712, "time_per_iteration": 3.169759750366211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058298, "balance_loss_mlp": 1.04938745, "diversity_loss_mlp": 0.0, "epoch": 0.7143131973836091, "flos": 535779712512.0, "grad_norm": 0.060002667598624965, "language_loss": 0.79623508, "learning_rate": 0.00019919478633742278, "loss": 0.80681807, "num_input_tokens_seen": 307821504, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 3713, "time_per_iteration": 2.644663095474243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061749, "balance_loss_mlp": 1.05258763, "diversity_loss_mlp": 0.0, "epoch": 0.7145055790688727, "flos": 473668300800.0, "grad_norm": 0.07397385813864758, "language_loss": 0.85182703, "learning_rate": 0.00019894598797445302, "loss": 0.86244452, "num_input_tokens_seen": 307886464, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3714, "time_per_iteration": 2.5240604877471924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061709, "balance_loss_mlp": 1.05239308, "diversity_loss_mlp": 0.0, "epoch": 0.7146979607541362, "flos": 570521885184.0, "grad_norm": 0.07339492646897193, "language_loss": 0.81885231, "learning_rate": 0.00019869730649101615, "loss": 0.82946944, "num_input_tokens_seen": 307962736, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3715, "time_per_iteration": 2.827868938446045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063135, "balance_loss_mlp": 1.05403948, "diversity_loss_mlp": 0.0, "epoch": 0.7148903424393998, "flos": 839666082816.0, "grad_norm": 0.0742719443850205, "language_loss": 0.72613627, "learning_rate": 0.00019844874198365943, "loss": 0.73676765, "num_input_tokens_seen": 308046592, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3716, "time_per_iteration": 3.0963878631591797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063516, "balance_loss_mlp": 1.05428362, "diversity_loss_mlp": 0.0, "epoch": 0.7150827241246633, "flos": 541823427072.0, "grad_norm": 0.061591749317610134, "language_loss": 0.83976817, "learning_rate": 0.00019820029454888362, "loss": 0.85040331, "num_input_tokens_seen": 308119920, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3717, "time_per_iteration": 2.7068889141082764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006732, "balance_loss_mlp": 1.0012722, "diversity_loss_mlp": 0.0, "epoch": 0.7152751058099269, "flos": 1583678200320.0, "grad_norm": 0.016486733546314403, "language_loss": 0.74521267, "learning_rate": 0.00019795196428314455, "loss": 0.75528002, "num_input_tokens_seen": 308361024, "router_z_loss_mlp": 0.0546875, "routerloss_mlp": 0.0, "step": 3718, "time_per_iteration": 5.0301513671875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010681, "balance_loss_mlp": 1.05873013, "diversity_loss_mlp": 0.0, "epoch": 0.7154674874951905, "flos": 517419145728.0, "grad_norm": 0.06632920905024949, "language_loss": 0.80107152, "learning_rate": 0.0001977037512828529, "loss": 0.81175244, "num_input_tokens_seen": 308429808, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3719, "time_per_iteration": 2.573982000350952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066134, "balance_loss_mlp": 1.05686522, "diversity_loss_mlp": 0.0, "epoch": 0.715659869180454, "flos": 602524127232.0, "grad_norm": 0.05986593090344285, "language_loss": 0.86432415, "learning_rate": 0.0001974556556443734, "loss": 0.87498546, "num_input_tokens_seen": 308501888, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 3720, "time_per_iteration": 2.7087209224700928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106825, "balance_loss_mlp": 1.0589757, "diversity_loss_mlp": 0.0, "epoch": 0.7158522508657176, "flos": 531675684864.0, "grad_norm": 0.05551674827732864, "language_loss": 0.88590324, "learning_rate": 0.00019720767746402547, "loss": 0.89658576, "num_input_tokens_seen": 308576368, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3721, "time_per_iteration": 2.7290821075439453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010754, "balance_loss_mlp": 1.06610191, "diversity_loss_mlp": 0.0, "epoch": 0.7160446325509812, "flos": 557569972224.0, "grad_norm": 0.07406216566818759, "language_loss": 0.79965603, "learning_rate": 0.00019695981683808222, "loss": 0.81041002, "num_input_tokens_seen": 308651936, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 3722, "time_per_iteration": 2.8323793411254883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072903, "balance_loss_mlp": 1.06386733, "diversity_loss_mlp": 0.0, "epoch": 0.7162370142362448, "flos": 690986847744.0, "grad_norm": 0.08922707402242334, "language_loss": 0.84955275, "learning_rate": 0.00019671207386277225, "loss": 0.86028177, "num_input_tokens_seen": 308737264, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 3723, "time_per_iteration": 2.94681978225708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069475, "balance_loss_mlp": 1.06010544, "diversity_loss_mlp": 0.0, "epoch": 0.7164293959215082, "flos": 794109173760.0, "grad_norm": 0.07420263460977167, "language_loss": 0.78355432, "learning_rate": 0.0001964644486342777, "loss": 0.79424912, "num_input_tokens_seen": 308811776, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3724, "time_per_iteration": 2.960944414138794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064733, "balance_loss_mlp": 1.05573297, "diversity_loss_mlp": 0.0, "epoch": 0.7166217776067718, "flos": 494178527232.0, "grad_norm": 0.0760825236490028, "language_loss": 0.86588323, "learning_rate": 0.00019621694124873524, "loss": 0.87653053, "num_input_tokens_seen": 308886704, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 3725, "time_per_iteration": 2.6881937980651855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101766, "balance_loss_mlp": 1.01224804, "diversity_loss_mlp": 0.0, "epoch": 0.7168141592920354, "flos": 1401060354048.0, "grad_norm": 0.018433056607108506, "language_loss": 0.76540077, "learning_rate": 0.00019596955180223557, "loss": 0.77557743, "num_input_tokens_seen": 309113456, "router_z_loss_mlp": 0.05419922, "routerloss_mlp": 0.0, "step": 3726, "time_per_iteration": 4.8842387199401855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057658, "balance_loss_mlp": 1.04820442, "diversity_loss_mlp": 0.0, "epoch": 0.717006540977299, "flos": 793150341120.0, "grad_norm": 0.08148717312552407, "language_loss": 0.77167314, "learning_rate": 0.00019572228039082428, "loss": 0.78224969, "num_input_tokens_seen": 309198768, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3727, "time_per_iteration": 3.071643829345703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055911, "balance_loss_mlp": 1.04670763, "diversity_loss_mlp": 0.0, "epoch": 0.7171989226625626, "flos": 554812416000.0, "grad_norm": 0.05270267691232831, "language_loss": 0.83482945, "learning_rate": 0.0001954751271105002, "loss": 0.84538865, "num_input_tokens_seen": 309279680, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 3728, "time_per_iteration": 2.8301711082458496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105429, "balance_loss_mlp": 1.04496169, "diversity_loss_mlp": 0.0, "epoch": 0.717391304347826, "flos": 555914409984.0, "grad_norm": 0.06896440922655821, "language_loss": 0.80838037, "learning_rate": 0.00019522809205721687, "loss": 0.81892335, "num_input_tokens_seen": 309359152, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 3729, "time_per_iteration": 2.8094747066497803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048553, "balance_loss_mlp": 1.03930831, "diversity_loss_mlp": 0.0, "epoch": 0.7175836860330896, "flos": 538855898112.0, "grad_norm": 0.09744205035272979, "language_loss": 0.83110106, "learning_rate": 0.0001949811753268816, "loss": 0.84158659, "num_input_tokens_seen": 309432800, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3730, "time_per_iteration": 2.6963374614715576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045755, "balance_loss_mlp": 1.03643274, "diversity_loss_mlp": 0.0, "epoch": 0.7177760677183532, "flos": 515637674496.0, "grad_norm": 0.0730125544637403, "language_loss": 0.82630277, "learning_rate": 0.00019473437701535634, "loss": 0.83676028, "num_input_tokens_seen": 309499456, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3731, "time_per_iteration": 2.6076574325561523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047574, "balance_loss_mlp": 1.03844213, "diversity_loss_mlp": 0.0, "epoch": 0.7179684494036168, "flos": 674719041024.0, "grad_norm": 0.07914181118847867, "language_loss": 0.89615285, "learning_rate": 0.00019448769721845677, "loss": 0.90662855, "num_input_tokens_seen": 309571056, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 3732, "time_per_iteration": 2.824897289276123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047996, "balance_loss_mlp": 1.03853655, "diversity_loss_mlp": 0.0, "epoch": 0.7181608310888803, "flos": 469912637952.0, "grad_norm": 0.07061643018013358, "language_loss": 0.86148334, "learning_rate": 0.00019424113603195203, "loss": 0.87196326, "num_input_tokens_seen": 309635040, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3733, "time_per_iteration": 2.520390510559082 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104846, "balance_loss_mlp": 1.03879809, "diversity_loss_mlp": 0.0, "epoch": 0.7183532127741439, "flos": 593952652800.0, "grad_norm": 0.07087799527916698, "language_loss": 0.79863775, "learning_rate": 0.0001939946935515657, "loss": 0.80912238, "num_input_tokens_seen": 309713696, "router_z_loss_mlp": 0.09649658, "routerloss_mlp": 0.0, "step": 3734, "time_per_iteration": 2.8286993503570557 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104904, "balance_loss_mlp": 1.03927684, "diversity_loss_mlp": 0.0, "epoch": 0.7185455944594075, "flos": 498917615616.0, "grad_norm": 0.08245280249652003, "language_loss": 0.80650169, "learning_rate": 0.0001937483698729755, "loss": 0.8169921, "num_input_tokens_seen": 309782864, "router_z_loss_mlp": 0.09759521, "routerloss_mlp": 0.0, "step": 3735, "time_per_iteration": 2.6458795070648193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043708, "balance_loss_mlp": 1.0338974, "diversity_loss_mlp": 0.0, "epoch": 0.718737976144671, "flos": 814933260288.0, "grad_norm": 0.07515481344769812, "language_loss": 0.82211673, "learning_rate": 0.0001935021650918128, "loss": 0.83255374, "num_input_tokens_seen": 309867056, "router_z_loss_mlp": 0.0980835, "routerloss_mlp": 0.0, "step": 3736, "time_per_iteration": 3.0285887718200684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043391, "balance_loss_mlp": 1.03346682, "diversity_loss_mlp": 0.0, "epoch": 0.7189303578299346, "flos": 438328143360.0, "grad_norm": 0.06979349456564556, "language_loss": 0.87017608, "learning_rate": 0.0001932560793036625, "loss": 0.88060999, "num_input_tokens_seen": 309929744, "router_z_loss_mlp": 0.09924316, "routerloss_mlp": 0.0, "step": 3737, "time_per_iteration": 2.482374906539917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044251, "balance_loss_mlp": 1.03452408, "diversity_loss_mlp": 0.0, "epoch": 0.7191227395151981, "flos": 549398992896.0, "grad_norm": 0.08340257337042449, "language_loss": 0.86882925, "learning_rate": 0.00019301011260406382, "loss": 0.87927186, "num_input_tokens_seen": 309998128, "router_z_loss_mlp": 0.09716797, "routerloss_mlp": 0.0, "step": 3738, "time_per_iteration": 2.6162045001983643 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104539, "balance_loss_mlp": 1.03576994, "diversity_loss_mlp": 0.0, "epoch": 0.7193151212004617, "flos": 626938320384.0, "grad_norm": 0.0721539169034284, "language_loss": 0.79805303, "learning_rate": 0.00019276426508850936, "loss": 0.80850697, "num_input_tokens_seen": 310065472, "router_z_loss_mlp": 0.09619141, "routerloss_mlp": 0.0, "step": 3739, "time_per_iteration": 2.7380456924438477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041013, "balance_loss_mlp": 1.03111315, "diversity_loss_mlp": 0.0, "epoch": 0.7195075028857253, "flos": 741062960640.0, "grad_norm": 0.0788007665709812, "language_loss": 0.80469853, "learning_rate": 0.00019251853685244564, "loss": 0.81510872, "num_input_tokens_seen": 310152960, "router_z_loss_mlp": 0.09899902, "routerloss_mlp": 0.0, "step": 3740, "time_per_iteration": 3.0559754371643066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044512, "balance_loss_mlp": 1.03485012, "diversity_loss_mlp": 0.0, "epoch": 0.7196998845709889, "flos": 802875566592.0, "grad_norm": 0.07989753754857366, "language_loss": 0.80738026, "learning_rate": 0.00019227292799127283, "loss": 0.81782538, "num_input_tokens_seen": 310234080, "router_z_loss_mlp": 0.09661865, "routerloss_mlp": 0.0, "step": 3741, "time_per_iteration": 3.0058369636535645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044827, "balance_loss_mlp": 1.03530192, "diversity_loss_mlp": 0.0, "epoch": 0.7198922662562524, "flos": 925183669248.0, "grad_norm": 0.17846470971826942, "language_loss": 0.79000109, "learning_rate": 0.00019202743860034454, "loss": 0.80044937, "num_input_tokens_seen": 310330208, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3742, "time_per_iteration": 3.218614339828491 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043684, "balance_loss_mlp": 1.03441513, "diversity_loss_mlp": 0.0, "epoch": 0.7200846479415159, "flos": 580111289856.0, "grad_norm": 0.07729553507192725, "language_loss": 0.83831203, "learning_rate": 0.00019178206877496873, "loss": 0.84874886, "num_input_tokens_seen": 310402960, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 3743, "time_per_iteration": 2.7014403343200684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048278, "balance_loss_mlp": 1.03885424, "diversity_loss_mlp": 0.0, "epoch": 0.7202770296267795, "flos": 557695881216.0, "grad_norm": 0.06342209640567653, "language_loss": 0.85333169, "learning_rate": 0.0001915368186104059, "loss": 0.86381447, "num_input_tokens_seen": 310479776, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3744, "time_per_iteration": 2.733520746231079 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105441, "balance_loss_mlp": 1.04513526, "diversity_loss_mlp": 0.0, "epoch": 0.7204694113120431, "flos": 672552129024.0, "grad_norm": 0.08207076889899251, "language_loss": 0.81176144, "learning_rate": 0.0001912916882018706, "loss": 0.8223055, "num_input_tokens_seen": 310555952, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 3745, "time_per_iteration": 2.7833125591278076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057126, "balance_loss_mlp": 1.04774427, "diversity_loss_mlp": 0.0, "epoch": 0.7206617929973067, "flos": 799194055680.0, "grad_norm": 0.08263651010752651, "language_loss": 0.79468751, "learning_rate": 0.00019104667764453125, "loss": 0.80525875, "num_input_tokens_seen": 310634784, "router_z_loss_mlp": 0.09368896, "routerloss_mlp": 0.0, "step": 3746, "time_per_iteration": 3.0572047233581543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066676, "balance_loss_mlp": 1.05751503, "diversity_loss_mlp": 0.0, "epoch": 0.7208541746825702, "flos": 531898140672.0, "grad_norm": 0.06554660744507769, "language_loss": 0.80441052, "learning_rate": 0.00019080178703350926, "loss": 0.8150773, "num_input_tokens_seen": 310703216, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 3747, "time_per_iteration": 2.6344070434570312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067771, "balance_loss_mlp": 1.05819249, "diversity_loss_mlp": 0.0, "epoch": 0.7210465563678338, "flos": 535139882496.0, "grad_norm": 0.07164749029527417, "language_loss": 0.83225226, "learning_rate": 0.00019055701646387952, "loss": 0.84292996, "num_input_tokens_seen": 310776816, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 3748, "time_per_iteration": 2.674436330795288 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01014621, "balance_loss_mlp": 1.00935245, "diversity_loss_mlp": 0.0, "epoch": 0.7212389380530974, "flos": 1533908606976.0, "grad_norm": 0.01350364958452467, "language_loss": 0.80472684, "learning_rate": 0.00019031236603067042, "loss": 0.8148731, "num_input_tokens_seen": 310987056, "router_z_loss_mlp": 0.05273438, "routerloss_mlp": 0.0, "step": 3749, "time_per_iteration": 4.8169167041778564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01074721, "balance_loss_mlp": 1.06568444, "diversity_loss_mlp": 0.0, "epoch": 0.7214313197383609, "flos": 461511862272.0, "grad_norm": 0.09948968640859872, "language_loss": 0.86443639, "learning_rate": 0.00019006783582886368, "loss": 0.87518358, "num_input_tokens_seen": 311051648, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 3750, "time_per_iteration": 2.6094882488250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01082564, "balance_loss_mlp": 1.0731287, "diversity_loss_mlp": 0.0, "epoch": 0.7216237014236244, "flos": 1037134056960.0, "grad_norm": 0.0940617497046545, "language_loss": 0.8313877, "learning_rate": 0.00018982342595339437, "loss": 0.84221339, "num_input_tokens_seen": 311146272, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3751, "time_per_iteration": 4.834062576293945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077441, "balance_loss_mlp": 1.06848848, "diversity_loss_mlp": 0.0, "epoch": 0.721816083108888, "flos": 895951466496.0, "grad_norm": 0.08300933032368943, "language_loss": 0.81837034, "learning_rate": 0.00018957913649915076, "loss": 0.82914484, "num_input_tokens_seen": 311223760, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 3752, "time_per_iteration": 3.1204826831817627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01076559, "balance_loss_mlp": 1.06739748, "diversity_loss_mlp": 0.0, "epoch": 0.7220084647941516, "flos": 523314556416.0, "grad_norm": 0.08305681898579634, "language_loss": 0.79633486, "learning_rate": 0.00018933496756097428, "loss": 0.80710053, "num_input_tokens_seen": 311290336, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 3753, "time_per_iteration": 2.6664350032806396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01077149, "balance_loss_mlp": 1.06786871, "diversity_loss_mlp": 0.0, "epoch": 0.7222008464794152, "flos": 816099494400.0, "grad_norm": 0.08328010196337048, "language_loss": 0.81679463, "learning_rate": 0.0001890909192336603, "loss": 0.82756615, "num_input_tokens_seen": 311366240, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 3754, "time_per_iteration": 2.994882822036743 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01073126, "balance_loss_mlp": 1.06407857, "diversity_loss_mlp": 0.0, "epoch": 0.7223932281646788, "flos": 749053702656.0, "grad_norm": 0.08777822688547723, "language_loss": 0.70716894, "learning_rate": 0.00018884699161195623, "loss": 0.71790028, "num_input_tokens_seen": 311445184, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 3755, "time_per_iteration": 4.262615442276001 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071208, "balance_loss_mlp": 1.06174874, "diversity_loss_mlp": 0.0, "epoch": 0.7225856098499422, "flos": 745502870016.0, "grad_norm": 0.0673256778775424, "language_loss": 0.77517748, "learning_rate": 0.00018860318479056327, "loss": 0.78588951, "num_input_tokens_seen": 311527280, "router_z_loss_mlp": 0.09460449, "routerloss_mlp": 0.0, "step": 3756, "time_per_iteration": 3.1185147762298584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064095, "balance_loss_mlp": 1.05514276, "diversity_loss_mlp": 0.0, "epoch": 0.7227779915352058, "flos": 547330825728.0, "grad_norm": 0.06734169026400741, "language_loss": 0.83406973, "learning_rate": 0.00018835949886413555, "loss": 0.84471071, "num_input_tokens_seen": 311601552, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 3757, "time_per_iteration": 2.7693490982055664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066701, "balance_loss_mlp": 1.05735517, "diversity_loss_mlp": 0.0, "epoch": 0.7229703732204694, "flos": 530484857856.0, "grad_norm": 0.0750419048722912, "language_loss": 0.78459024, "learning_rate": 0.0001881159339272806, "loss": 0.79525727, "num_input_tokens_seen": 311670736, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 3758, "time_per_iteration": 2.6415517330169678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059793, "balance_loss_mlp": 1.05062032, "diversity_loss_mlp": 0.0, "epoch": 0.723162754905733, "flos": 528355021824.0, "grad_norm": 0.0644798827635335, "language_loss": 0.78601432, "learning_rate": 0.00018787249007455858, "loss": 0.79661226, "num_input_tokens_seen": 311736800, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 3759, "time_per_iteration": 2.6022799015045166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063242, "balance_loss_mlp": 1.05413401, "diversity_loss_mlp": 0.0, "epoch": 0.7233551365909965, "flos": 654868468224.0, "grad_norm": 0.07015599197769962, "language_loss": 0.71291095, "learning_rate": 0.00018762916740048302, "loss": 0.72354335, "num_input_tokens_seen": 311806064, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3760, "time_per_iteration": 2.8239991664886475 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059773, "balance_loss_mlp": 1.05033171, "diversity_loss_mlp": 0.0, "epoch": 0.7235475182762601, "flos": 522365635584.0, "grad_norm": 0.07068719643677601, "language_loss": 0.86275655, "learning_rate": 0.0001873859659995195, "loss": 0.87335426, "num_input_tokens_seen": 311881280, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3761, "time_per_iteration": 2.825853109359741 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056903, "balance_loss_mlp": 1.04742599, "diversity_loss_mlp": 0.0, "epoch": 0.7237398999615237, "flos": 609170595840.0, "grad_norm": 0.06521234046982781, "language_loss": 0.83369851, "learning_rate": 0.0001871428859660878, "loss": 0.84426749, "num_input_tokens_seen": 311953696, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3762, "time_per_iteration": 2.765061855316162 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054846, "balance_loss_mlp": 1.04584002, "diversity_loss_mlp": 0.0, "epoch": 0.7239322816467872, "flos": 658987176960.0, "grad_norm": 0.06876344834189922, "language_loss": 0.81910485, "learning_rate": 0.00018689992739455975, "loss": 0.82965332, "num_input_tokens_seen": 312032752, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 3763, "time_per_iteration": 2.955744504928589 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050714, "balance_loss_mlp": 1.04123139, "diversity_loss_mlp": 0.0, "epoch": 0.7241246633320508, "flos": 969282878976.0, "grad_norm": 0.06967924844938471, "language_loss": 0.85903621, "learning_rate": 0.00018665709037926027, "loss": 0.86954343, "num_input_tokens_seen": 312120800, "router_z_loss_mlp": 0.0947876, "routerloss_mlp": 0.0, "step": 3764, "time_per_iteration": 3.306689977645874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050074, "balance_loss_mlp": 1.04077554, "diversity_loss_mlp": 0.0, "epoch": 0.7243170450173143, "flos": 514995273216.0, "grad_norm": 0.07823184864923875, "language_loss": 0.8509047, "learning_rate": 0.00018641437501446694, "loss": 0.86140537, "num_input_tokens_seen": 312188416, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 3765, "time_per_iteration": 2.5606436729431152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053259, "balance_loss_mlp": 1.04385924, "diversity_loss_mlp": 0.0, "epoch": 0.7245094267025779, "flos": 559746796032.0, "grad_norm": 0.07453327039799393, "language_loss": 0.8240428, "learning_rate": 0.0001861717813944104, "loss": 0.83457536, "num_input_tokens_seen": 312257792, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3766, "time_per_iteration": 2.639479875564575 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052028, "balance_loss_mlp": 1.04260468, "diversity_loss_mlp": 0.0, "epoch": 0.7247018083878415, "flos": 612642134016.0, "grad_norm": 0.07462880824505752, "language_loss": 0.79635704, "learning_rate": 0.00018592930961327365, "loss": 0.80687737, "num_input_tokens_seen": 312328544, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3767, "time_per_iteration": 2.71537446975708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051032, "balance_loss_mlp": 1.04159653, "diversity_loss_mlp": 0.0, "epoch": 0.7248941900731051, "flos": 634676871168.0, "grad_norm": 0.06502387009338012, "language_loss": 0.88172042, "learning_rate": 0.00018568695976519273, "loss": 0.89223075, "num_input_tokens_seen": 312405888, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3768, "time_per_iteration": 2.7851336002349854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053572, "balance_loss_mlp": 1.04388046, "diversity_loss_mlp": 0.0, "epoch": 0.7250865717583687, "flos": 424941230592.0, "grad_norm": 0.07526480217284313, "language_loss": 0.80197144, "learning_rate": 0.00018544473194425593, "loss": 0.81250715, "num_input_tokens_seen": 312469552, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 3769, "time_per_iteration": 2.5187532901763916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054045, "balance_loss_mlp": 1.044276, "diversity_loss_mlp": 0.0, "epoch": 0.7252789534436321, "flos": 635114068992.0, "grad_norm": 0.07238275679239237, "language_loss": 0.78824592, "learning_rate": 0.00018520262624450485, "loss": 0.79878634, "num_input_tokens_seen": 312548848, "router_z_loss_mlp": 0.09759521, "routerloss_mlp": 0.0, "step": 3770, "time_per_iteration": 2.8748114109039307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057377, "balance_loss_mlp": 1.04787064, "diversity_loss_mlp": 0.0, "epoch": 0.7254713351288957, "flos": 617185930752.0, "grad_norm": 0.08918095477851212, "language_loss": 0.86894727, "learning_rate": 0.00018496064275993324, "loss": 0.87952113, "num_input_tokens_seen": 312622016, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3771, "time_per_iteration": 2.824845314025879 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105481, "balance_loss_mlp": 1.04509437, "diversity_loss_mlp": 0.0, "epoch": 0.7256637168141593, "flos": 766986983424.0, "grad_norm": 0.06900224223805673, "language_loss": 0.82001221, "learning_rate": 0.00018471878158448686, "loss": 0.83056033, "num_input_tokens_seen": 312696960, "router_z_loss_mlp": 0.0970459, "routerloss_mlp": 0.0, "step": 3772, "time_per_iteration": 2.9548990726470947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056235, "balance_loss_mlp": 1.04668033, "diversity_loss_mlp": 0.0, "epoch": 0.7258560984994229, "flos": 495559503360.0, "grad_norm": 0.058256019250052936, "language_loss": 0.84301949, "learning_rate": 0.00018447704281206512, "loss": 0.85358179, "num_input_tokens_seen": 312774352, "router_z_loss_mlp": 0.09545898, "routerloss_mlp": 0.0, "step": 3773, "time_per_iteration": 2.83591365814209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055601, "balance_loss_mlp": 1.04598725, "diversity_loss_mlp": 0.0, "epoch": 0.7260484801846864, "flos": 530069681664.0, "grad_norm": 0.07576068763334884, "language_loss": 0.82763028, "learning_rate": 0.0001842354265365191, "loss": 0.83818638, "num_input_tokens_seen": 312849600, "router_z_loss_mlp": 0.09613037, "routerloss_mlp": 0.0, "step": 3774, "time_per_iteration": 2.68778657913208 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060495, "balance_loss_mlp": 1.05112517, "diversity_loss_mlp": 0.0, "epoch": 0.72624086186995, "flos": 624964128768.0, "grad_norm": 0.0805275617178238, "language_loss": 0.80610001, "learning_rate": 0.0001839939328516526, "loss": 0.81670493, "num_input_tokens_seen": 312922688, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3775, "time_per_iteration": 2.7422258853912354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790959, "balance_loss_mlp": 1.33957541, "diversity_loss_mlp": 0.21958014, "epoch": 0.7264332435552135, "flos": 716522858496.0, "grad_norm": 0.033705672182060005, "language_loss": 0.8138454, "learning_rate": 0.0001837525618512218, "loss": 0.82175499, "num_input_tokens_seen": 312997728, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01138153, "step": 3776, "time_per_iteration": 2.9108829498291016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053299, "balance_loss_mlp": 1.04409015, "diversity_loss_mlp": 0.0, "epoch": 0.7266256252404771, "flos": 681036968448.0, "grad_norm": 0.07511121424148261, "language_loss": 0.8321476, "learning_rate": 0.00018351131362893519, "loss": 0.84268057, "num_input_tokens_seen": 313067168, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 3777, "time_per_iteration": 2.789809465408325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058309, "balance_loss_mlp": 1.04874849, "diversity_loss_mlp": 0.0, "epoch": 0.7268180069257407, "flos": 518906580480.0, "grad_norm": 0.08246656435114352, "language_loss": 0.80534494, "learning_rate": 0.00018327018827845364, "loss": 0.81592798, "num_input_tokens_seen": 313134688, "router_z_loss_mlp": 0.09558105, "routerloss_mlp": 0.0, "step": 3778, "time_per_iteration": 2.6201207637786865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059499, "balance_loss_mlp": 1.0502367, "diversity_loss_mlp": 0.0, "epoch": 0.7270103886110042, "flos": 512662804992.0, "grad_norm": 0.060849425034284504, "language_loss": 0.87504601, "learning_rate": 0.00018302918589339036, "loss": 0.88564098, "num_input_tokens_seen": 313204816, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3779, "time_per_iteration": 2.689378499984741 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061153, "balance_loss_mlp": 1.05198562, "diversity_loss_mlp": 0.0, "epoch": 0.7272027702962678, "flos": 546653919744.0, "grad_norm": 0.06743911417724738, "language_loss": 0.90138805, "learning_rate": 0.00018278830656731054, "loss": 0.91199952, "num_input_tokens_seen": 313274288, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 3780, "time_per_iteration": 2.6595706939697266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056667, "balance_loss_mlp": 1.04758894, "diversity_loss_mlp": 0.0, "epoch": 0.7273951519815314, "flos": 593048521728.0, "grad_norm": 0.06124301945992682, "language_loss": 0.86350238, "learning_rate": 0.00018254755039373222, "loss": 0.87406909, "num_input_tokens_seen": 313344800, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 3781, "time_per_iteration": 2.7230565547943115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062194, "balance_loss_mlp": 1.0530144, "diversity_loss_mlp": 0.0, "epoch": 0.727587533666795, "flos": 606012917760.0, "grad_norm": 0.07105415138975459, "language_loss": 0.83752382, "learning_rate": 0.0001823069174661252, "loss": 0.84814572, "num_input_tokens_seen": 313417840, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 3782, "time_per_iteration": 2.7941086292266846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056585, "balance_loss_mlp": 1.04759097, "diversity_loss_mlp": 0.0, "epoch": 0.7277799153520584, "flos": 513021081600.0, "grad_norm": 0.06458866746308467, "language_loss": 0.78171599, "learning_rate": 0.00018206640787791112, "loss": 0.79228187, "num_input_tokens_seen": 313485936, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 3783, "time_per_iteration": 2.618022918701172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062955, "balance_loss_mlp": 1.05387712, "diversity_loss_mlp": 0.0, "epoch": 0.727972297037322, "flos": 537756475392.0, "grad_norm": 0.06663972838638854, "language_loss": 0.85480422, "learning_rate": 0.00018182602172246416, "loss": 0.86543375, "num_input_tokens_seen": 313553136, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3784, "time_per_iteration": 2.6113829612731934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066913, "balance_loss_mlp": 1.05812776, "diversity_loss_mlp": 0.0, "epoch": 0.7281646787225856, "flos": 535038566400.0, "grad_norm": 0.07678107880467737, "language_loss": 0.76375031, "learning_rate": 0.00018158575909311075, "loss": 0.77441949, "num_input_tokens_seen": 313620128, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 3785, "time_per_iteration": 2.650192975997925 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061503, "balance_loss_mlp": 1.05243719, "diversity_loss_mlp": 0.0, "epoch": 0.7283570604078492, "flos": 625055533056.0, "grad_norm": 0.07604258502871962, "language_loss": 0.79732937, "learning_rate": 0.000181345620083129, "loss": 0.80794436, "num_input_tokens_seen": 313696432, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3786, "time_per_iteration": 2.8074841499328613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061193, "balance_loss_mlp": 1.05211556, "diversity_loss_mlp": 0.0, "epoch": 0.7285494420931128, "flos": 534173709312.0, "grad_norm": 0.0629164713746694, "language_loss": 0.86736983, "learning_rate": 0.00018110560478574927, "loss": 0.87798178, "num_input_tokens_seen": 313768416, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3787, "time_per_iteration": 2.6831634044647217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106216, "balance_loss_mlp": 1.05288577, "diversity_loss_mlp": 0.0, "epoch": 0.7287418237783763, "flos": 666548061696.0, "grad_norm": 0.07652228362928638, "language_loss": 0.80521822, "learning_rate": 0.0001808657132941533, "loss": 0.81583983, "num_input_tokens_seen": 313839888, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 3788, "time_per_iteration": 2.7681210041046143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063738, "balance_loss_mlp": 1.05462408, "diversity_loss_mlp": 0.0, "epoch": 0.7289342054636399, "flos": 550602302976.0, "grad_norm": 0.06755228065084157, "language_loss": 0.83012414, "learning_rate": 0.00018062594570147572, "loss": 0.84076142, "num_input_tokens_seen": 313908832, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3789, "time_per_iteration": 2.59897780418396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069496, "balance_loss_mlp": 1.06051326, "diversity_loss_mlp": 0.0, "epoch": 0.7291265871489034, "flos": 687923145216.0, "grad_norm": 0.0602370632110868, "language_loss": 0.84944886, "learning_rate": 0.00018038630210080243, "loss": 0.86014384, "num_input_tokens_seen": 313982672, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 3790, "time_per_iteration": 2.8492085933685303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061985, "balance_loss_mlp": 1.05299687, "diversity_loss_mlp": 0.0, "epoch": 0.729318968834167, "flos": 572664204288.0, "grad_norm": 0.06258751029355039, "language_loss": 0.85112703, "learning_rate": 0.0001801467825851712, "loss": 0.86174691, "num_input_tokens_seen": 314057184, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 3791, "time_per_iteration": 2.724008321762085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063231, "balance_loss_mlp": 1.05412316, "diversity_loss_mlp": 0.0, "epoch": 0.7295113505194305, "flos": 586061028864.0, "grad_norm": 0.06759881980366181, "language_loss": 0.78407717, "learning_rate": 0.00017990738724757172, "loss": 0.79470944, "num_input_tokens_seen": 314137344, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3792, "time_per_iteration": 2.8527557849884033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065387, "balance_loss_mlp": 1.05635726, "diversity_loss_mlp": 0.0, "epoch": 0.7297037322046941, "flos": 707185645056.0, "grad_norm": 0.05706424828537789, "language_loss": 0.82412189, "learning_rate": 0.00017966811618094598, "loss": 0.83477581, "num_input_tokens_seen": 314214464, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 3793, "time_per_iteration": 2.891587734222412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071379, "balance_loss_mlp": 1.06256318, "diversity_loss_mlp": 0.0, "epoch": 0.7298961138899577, "flos": 487292350464.0, "grad_norm": 0.0800044571001495, "language_loss": 0.84934509, "learning_rate": 0.00017942896947818664, "loss": 0.86005884, "num_input_tokens_seen": 314280432, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 3794, "time_per_iteration": 2.578213691711426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01027287, "balance_loss_mlp": 1.02208936, "diversity_loss_mlp": 0.0, "epoch": 0.7300884955752213, "flos": 1365804260352.0, "grad_norm": 0.018812365315957286, "language_loss": 0.74825054, "learning_rate": 0.000179189947232139, "loss": 0.7585234, "num_input_tokens_seen": 314497152, "router_z_loss_mlp": 0.05200195, "routerloss_mlp": 0.0, "step": 3795, "time_per_iteration": 4.8731958866119385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065802, "balance_loss_mlp": 1.05696881, "diversity_loss_mlp": 0.0, "epoch": 0.7302808772604849, "flos": 531806736384.0, "grad_norm": 0.08247331408198653, "language_loss": 0.85473979, "learning_rate": 0.00017895104953559947, "loss": 0.86539787, "num_input_tokens_seen": 314565488, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 3796, "time_per_iteration": 2.6150035858154297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071519, "balance_loss_mlp": 1.06257856, "diversity_loss_mlp": 0.0, "epoch": 0.7304732589457483, "flos": 436171143168.0, "grad_norm": 0.0876682306683089, "language_loss": 0.90019357, "learning_rate": 0.00017871227648131672, "loss": 0.91090876, "num_input_tokens_seen": 314627392, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 3797, "time_per_iteration": 2.5456666946411133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790219, "balance_loss_mlp": 1.33552265, "diversity_loss_mlp": 0.2213349, "epoch": 0.7306656406310119, "flos": 451621080576.0, "grad_norm": 0.0295011086457174, "language_loss": 0.82969385, "learning_rate": 0.0001784736281619907, "loss": 0.83759606, "num_input_tokens_seen": 314695440, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01178985, "step": 3798, "time_per_iteration": 2.617690086364746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064248, "balance_loss_mlp": 1.05507529, "diversity_loss_mlp": 0.0, "epoch": 0.7308580223162755, "flos": 512010491904.0, "grad_norm": 0.0761333988969544, "language_loss": 0.74143457, "learning_rate": 0.00017823510467027232, "loss": 0.75207704, "num_input_tokens_seen": 314772592, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 3799, "time_per_iteration": 2.74944806098938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061269, "balance_loss_mlp": 1.05231094, "diversity_loss_mlp": 0.0, "epoch": 0.7310504040015391, "flos": 375423455232.0, "grad_norm": 0.07529945885516458, "language_loss": 0.7849319, "learning_rate": 0.00017799670609876516, "loss": 0.79554456, "num_input_tokens_seen": 314836192, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 3800, "time_per_iteration": 2.514719247817993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106109, "balance_loss_mlp": 1.05228066, "diversity_loss_mlp": 0.0, "epoch": 0.7312427856868026, "flos": 549334752768.0, "grad_norm": 0.07202410794231434, "language_loss": 0.89223945, "learning_rate": 0.00017775843254002366, "loss": 0.90285027, "num_input_tokens_seen": 314908400, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 3801, "time_per_iteration": 2.742403507232666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059589, "balance_loss_mlp": 1.05084491, "diversity_loss_mlp": 0.0, "epoch": 0.7314351673720662, "flos": 767238801408.0, "grad_norm": 0.060424645606399964, "language_loss": 0.83728462, "learning_rate": 0.00017752028408655367, "loss": 0.84788048, "num_input_tokens_seen": 314995280, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 3802, "time_per_iteration": 3.0845768451690674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.007903, "balance_loss_mlp": 1.33712423, "diversity_loss_mlp": 0.22043222, "epoch": 0.7316275490573297, "flos": 486734012928.0, "grad_norm": 0.03351149815402085, "language_loss": 0.85395515, "learning_rate": 0.00017728226083081272, "loss": 0.86185813, "num_input_tokens_seen": 315063056, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01152179, "step": 3803, "time_per_iteration": 2.625450849533081 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064175, "balance_loss_mlp": 1.05536509, "diversity_loss_mlp": 0.0, "epoch": 0.7318199307425933, "flos": 473428592640.0, "grad_norm": 0.06980647435682294, "language_loss": 0.81371546, "learning_rate": 0.00017704436286520965, "loss": 0.82435715, "num_input_tokens_seen": 315128896, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 3804, "time_per_iteration": 2.5445075035095215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064875, "balance_loss_mlp": 1.05574334, "diversity_loss_mlp": 0.0, "epoch": 0.7320123124278569, "flos": 549463233024.0, "grad_norm": 0.0710476755005787, "language_loss": 0.84313726, "learning_rate": 0.0001768065902821046, "loss": 0.85378599, "num_input_tokens_seen": 315198464, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 3805, "time_per_iteration": 2.6542673110961914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060899, "balance_loss_mlp": 1.05200648, "diversity_loss_mlp": 0.0, "epoch": 0.7322046941131204, "flos": 570781416960.0, "grad_norm": 0.07797130890244271, "language_loss": 0.8206104, "learning_rate": 0.00017656894317380907, "loss": 0.83121943, "num_input_tokens_seen": 315270240, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 3806, "time_per_iteration": 2.701544761657715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01020369, "balance_loss_mlp": 1.01498067, "diversity_loss_mlp": 0.0, "epoch": 0.732397075798384, "flos": 1469165548032.0, "grad_norm": 0.021367923460696967, "language_loss": 0.76031268, "learning_rate": 0.00017633142163258565, "loss": 0.7705164, "num_input_tokens_seen": 315502448, "router_z_loss_mlp": 0.05395508, "routerloss_mlp": 0.0, "step": 3807, "time_per_iteration": 5.001535177230835 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066451, "balance_loss_mlp": 1.05737972, "diversity_loss_mlp": 0.0, "epoch": 0.7325894574836476, "flos": 464862260736.0, "grad_norm": 0.08165775614059534, "language_loss": 0.83709639, "learning_rate": 0.00017609402575064875, "loss": 0.84776092, "num_input_tokens_seen": 315569472, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 3808, "time_per_iteration": 2.583564043045044 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061217, "balance_loss_mlp": 1.05229425, "diversity_loss_mlp": 0.0, "epoch": 0.7327818391689112, "flos": 495493065216.0, "grad_norm": 0.0811056502064105, "language_loss": 0.80930746, "learning_rate": 0.00017585675562016367, "loss": 0.81991959, "num_input_tokens_seen": 315637632, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 3809, "time_per_iteration": 2.6347053050994873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0101136, "balance_loss_mlp": 1.00604343, "diversity_loss_mlp": 0.0, "epoch": 0.7329742208541746, "flos": 1433489508864.0, "grad_norm": 0.015405005389362274, "language_loss": 0.77212846, "learning_rate": 0.0001756196113332465, "loss": 0.78224206, "num_input_tokens_seen": 315863648, "router_z_loss_mlp": 0.05322266, "routerloss_mlp": 0.0, "step": 3810, "time_per_iteration": 4.809669017791748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010632, "balance_loss_mlp": 1.05418134, "diversity_loss_mlp": 0.0, "epoch": 0.7331666025394382, "flos": 496889095680.0, "grad_norm": 0.08174261034044085, "language_loss": 0.85100114, "learning_rate": 0.00017538259298196474, "loss": 0.86163306, "num_input_tokens_seen": 315930752, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 3811, "time_per_iteration": 2.5669541358947754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066918, "balance_loss_mlp": 1.05802464, "diversity_loss_mlp": 0.0, "epoch": 0.7333589842247018, "flos": 538524785664.0, "grad_norm": 0.06518192792765873, "language_loss": 0.82332867, "learning_rate": 0.00017514570065833745, "loss": 0.83399785, "num_input_tokens_seen": 316006400, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 3812, "time_per_iteration": 2.7447328567504883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071687, "balance_loss_mlp": 1.06259131, "diversity_loss_mlp": 0.0, "epoch": 0.7335513659099654, "flos": 491067836928.0, "grad_norm": 0.09580264059121266, "language_loss": 0.80788046, "learning_rate": 0.00017490893445433426, "loss": 0.81859732, "num_input_tokens_seen": 316075824, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3813, "time_per_iteration": 2.6378085613250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064376, "balance_loss_mlp": 1.05522716, "diversity_loss_mlp": 0.0, "epoch": 0.733743747595229, "flos": 562150844928.0, "grad_norm": 0.07102449829418327, "language_loss": 0.81571025, "learning_rate": 0.00017467229446187587, "loss": 0.82635403, "num_input_tokens_seen": 316148336, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 3814, "time_per_iteration": 2.7120914459228516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072677, "balance_loss_mlp": 1.06393909, "diversity_loss_mlp": 0.0, "epoch": 0.7339361292804925, "flos": 538581685248.0, "grad_norm": 0.07114012207935533, "language_loss": 0.81285048, "learning_rate": 0.00017443578077283424, "loss": 0.82357717, "num_input_tokens_seen": 316220960, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 3815, "time_per_iteration": 2.6395435333251953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106649, "balance_loss_mlp": 1.05747199, "diversity_loss_mlp": 0.0, "epoch": 0.734128510965756, "flos": 548469895680.0, "grad_norm": 0.07483834875110257, "language_loss": 0.84961641, "learning_rate": 0.0001741993934790319, "loss": 0.86028135, "num_input_tokens_seen": 316295824, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 3816, "time_per_iteration": 2.726897716522217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059942, "balance_loss_mlp": 1.05116272, "diversity_loss_mlp": 0.0, "epoch": 0.7343208926510196, "flos": 540066548736.0, "grad_norm": 0.07480496039033006, "language_loss": 0.84648383, "learning_rate": 0.00017396313267224273, "loss": 0.85708326, "num_input_tokens_seen": 316368064, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 3817, "time_per_iteration": 2.8066418170928955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066831, "balance_loss_mlp": 1.05799198, "diversity_loss_mlp": 0.0, "epoch": 0.7345132743362832, "flos": 571095277056.0, "grad_norm": 0.0889487029403391, "language_loss": 0.8847158, "learning_rate": 0.0001737269984441912, "loss": 0.89538407, "num_input_tokens_seen": 316437440, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 3818, "time_per_iteration": 2.6318438053131104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060197, "balance_loss_mlp": 1.05124998, "diversity_loss_mlp": 0.0, "epoch": 0.7347056560215467, "flos": 545403621888.0, "grad_norm": 0.07556044268941689, "language_loss": 0.85168499, "learning_rate": 0.00017349099088655263, "loss": 0.86228693, "num_input_tokens_seen": 316511936, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 3819, "time_per_iteration": 2.6988065242767334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058709, "balance_loss_mlp": 1.05007255, "diversity_loss_mlp": 0.0, "epoch": 0.7348980377068103, "flos": 595949239296.0, "grad_norm": 0.06839680418094873, "language_loss": 0.80908042, "learning_rate": 0.00017325511009095375, "loss": 0.81966752, "num_input_tokens_seen": 316584304, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 3820, "time_per_iteration": 2.727027177810669 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057302, "balance_loss_mlp": 1.04837942, "diversity_loss_mlp": 0.0, "epoch": 0.7350904193920739, "flos": 538554521088.0, "grad_norm": 0.07744320065165705, "language_loss": 0.83646286, "learning_rate": 0.00017301935614897113, "loss": 0.84703583, "num_input_tokens_seen": 316659024, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 3821, "time_per_iteration": 2.6904449462890625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059614, "balance_loss_mlp": 1.05071497, "diversity_loss_mlp": 0.0, "epoch": 0.7352828010773375, "flos": 512981434368.0, "grad_norm": 0.06367960554180149, "language_loss": 0.82050133, "learning_rate": 0.00017278372915213274, "loss": 0.83109748, "num_input_tokens_seen": 316732544, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 3822, "time_per_iteration": 2.715162515640259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01009526, "balance_loss_mlp": 1.00437641, "diversity_loss_mlp": 0.0, "epoch": 0.735475182762601, "flos": 1553820848640.0, "grad_norm": 0.013680325571624621, "language_loss": 0.79893845, "learning_rate": 0.00017254822919191693, "loss": 0.80903369, "num_input_tokens_seen": 316967104, "router_z_loss_mlp": 0.05151367, "routerloss_mlp": 0.0, "step": 3823, "time_per_iteration": 4.962257146835327 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056616, "balance_loss_mlp": 1.04753208, "diversity_loss_mlp": 0.0, "epoch": 0.7356675644478645, "flos": 681308610048.0, "grad_norm": 0.08246165896918017, "language_loss": 0.80686677, "learning_rate": 0.00017231285635975314, "loss": 0.81743288, "num_input_tokens_seen": 317048304, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 3824, "time_per_iteration": 2.892613172531128 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060803, "balance_loss_mlp": 1.05131412, "diversity_loss_mlp": 0.0, "epoch": 0.7358599461331281, "flos": 515215157760.0, "grad_norm": 0.06805025721620432, "language_loss": 0.83387762, "learning_rate": 0.00017207761074702115, "loss": 0.84448564, "num_input_tokens_seen": 317115968, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3825, "time_per_iteration": 2.600008964538574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061591, "balance_loss_mlp": 1.05259085, "diversity_loss_mlp": 0.0, "epoch": 0.7360523278183917, "flos": 443973934080.0, "grad_norm": 0.06050130894095604, "language_loss": 0.84002912, "learning_rate": 0.0001718424924450514, "loss": 0.85064507, "num_input_tokens_seen": 317185680, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 3826, "time_per_iteration": 2.5992300510406494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054783, "balance_loss_mlp": 1.04562807, "diversity_loss_mlp": 0.0, "epoch": 0.7362447095036553, "flos": 603423489024.0, "grad_norm": 0.057066515344493245, "language_loss": 0.86262274, "learning_rate": 0.00017160750154512482, "loss": 0.87317061, "num_input_tokens_seen": 317258800, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 3827, "time_per_iteration": 2.726304292678833 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00795034, "balance_loss_mlp": 1.34579134, "diversity_loss_mlp": 0.220893, "epoch": 0.7364370911889189, "flos": 553095184896.0, "grad_norm": 0.03015959834370855, "language_loss": 0.83901906, "learning_rate": 0.0001713726381384731, "loss": 0.84696937, "num_input_tokens_seen": 317334608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01169185, "step": 3828, "time_per_iteration": 2.8043603897094727 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061645, "balance_loss_mlp": 1.05248344, "diversity_loss_mlp": 0.0, "epoch": 0.7366294728741823, "flos": 449061387264.0, "grad_norm": 0.06844777280948466, "language_loss": 0.81076348, "learning_rate": 0.00017113790231627812, "loss": 0.8213799, "num_input_tokens_seen": 317397504, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3829, "time_per_iteration": 2.619093179702759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100728, "balance_loss_mlp": 1.0020107, "diversity_loss_mlp": 0.0, "epoch": 0.7368218545594459, "flos": 1535502500352.0, "grad_norm": 0.01400462839453399, "language_loss": 0.79258227, "learning_rate": 0.0001709032941696726, "loss": 0.80265498, "num_input_tokens_seen": 317611472, "router_z_loss_mlp": 0.05273438, "routerloss_mlp": 0.0, "step": 3830, "time_per_iteration": 4.812221527099609 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00792371, "balance_loss_mlp": 1.34191561, "diversity_loss_mlp": 0.21972378, "epoch": 0.7370142362447095, "flos": 515425130496.0, "grad_norm": 0.03330075510268521, "language_loss": 0.81812584, "learning_rate": 0.00017066881378973936, "loss": 0.82604957, "num_input_tokens_seen": 317681328, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01155161, "step": 3831, "time_per_iteration": 2.7056965827941895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060657, "balance_loss_mlp": 1.05176377, "diversity_loss_mlp": 0.0, "epoch": 0.7372066179299731, "flos": 500805172224.0, "grad_norm": 0.07192956817041389, "language_loss": 0.83134949, "learning_rate": 0.00017043446126751189, "loss": 0.84195602, "num_input_tokens_seen": 317752336, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 3832, "time_per_iteration": 2.676421880722046 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060842, "balance_loss_mlp": 1.05175185, "diversity_loss_mlp": 0.0, "epoch": 0.7373989996152366, "flos": 558083893248.0, "grad_norm": 0.07065913186643534, "language_loss": 0.76922351, "learning_rate": 0.00017020023669397376, "loss": 0.77983195, "num_input_tokens_seen": 317824112, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 3833, "time_per_iteration": 2.67942214012146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063044, "balance_loss_mlp": 1.0536567, "diversity_loss_mlp": 0.0, "epoch": 0.7375913813005002, "flos": 506777306112.0, "grad_norm": 0.07582868630536281, "language_loss": 0.81676751, "learning_rate": 0.0001699661401600589, "loss": 0.82739794, "num_input_tokens_seen": 317889120, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 3834, "time_per_iteration": 2.5813028812408447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791828, "balance_loss_mlp": 1.34016216, "diversity_loss_mlp": 0.22067872, "epoch": 0.7377837629857638, "flos": 486183015936.0, "grad_norm": 0.03104422851251126, "language_loss": 0.78392982, "learning_rate": 0.00016973217175665205, "loss": 0.79184818, "num_input_tokens_seen": 317953792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01140742, "step": 3835, "time_per_iteration": 2.622943639755249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01002245, "balance_loss_mlp": 0.99702322, "diversity_loss_mlp": 0.0, "epoch": 0.7379761446710273, "flos": 1414693942272.0, "grad_norm": 0.013207371532760371, "language_loss": 0.8116616, "learning_rate": 0.00016949833157458755, "loss": 0.82168412, "num_input_tokens_seen": 318184848, "router_z_loss_mlp": 0.05224609, "routerloss_mlp": 0.0, "step": 3836, "time_per_iteration": 4.931336402893066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060295, "balance_loss_mlp": 1.05126452, "diversity_loss_mlp": 0.0, "epoch": 0.7381685263562909, "flos": 629737721856.0, "grad_norm": 0.06649751574670516, "language_loss": 0.84498501, "learning_rate": 0.00016926461970465047, "loss": 0.85558796, "num_input_tokens_seen": 318259296, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 3837, "time_per_iteration": 2.765747547149658 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059108, "balance_loss_mlp": 1.04992294, "diversity_loss_mlp": 0.0, "epoch": 0.7383609080415544, "flos": 739224589824.0, "grad_norm": 0.0574260047104924, "language_loss": 0.84358233, "learning_rate": 0.00016903103623757516, "loss": 0.85417342, "num_input_tokens_seen": 318344704, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3838, "time_per_iteration": 3.069658041000366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060146, "balance_loss_mlp": 1.05106258, "diversity_loss_mlp": 0.0, "epoch": 0.738553289726818, "flos": 550206950400.0, "grad_norm": 0.19052913382225448, "language_loss": 0.80133057, "learning_rate": 0.00016879758126404738, "loss": 0.81193197, "num_input_tokens_seen": 318416128, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 3839, "time_per_iteration": 2.689941167831421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00789085, "balance_loss_mlp": 1.33350182, "diversity_loss_mlp": 0.2223025, "epoch": 0.7387456714120816, "flos": 910294640640.0, "grad_norm": 0.03551016649676842, "language_loss": 0.79851139, "learning_rate": 0.00016856425487470216, "loss": 0.80640227, "num_input_tokens_seen": 318498128, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01118332, "step": 3840, "time_per_iteration": 3.1254615783691406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064553, "balance_loss_mlp": 1.05543303, "diversity_loss_mlp": 0.0, "epoch": 0.7389380530973452, "flos": 852684807168.0, "grad_norm": 0.0706997471436485, "language_loss": 0.79199183, "learning_rate": 0.00016833105716012486, "loss": 0.8026374, "num_input_tokens_seen": 318578048, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 3841, "time_per_iteration": 3.138193368911743 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063082, "balance_loss_mlp": 1.05398655, "diversity_loss_mlp": 0.0, "epoch": 0.7391304347826086, "flos": 817026020352.0, "grad_norm": 0.06630465632536123, "language_loss": 0.85135829, "learning_rate": 0.00016809798821085088, "loss": 0.86198914, "num_input_tokens_seen": 318654784, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3842, "time_per_iteration": 3.0023772716522217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01070258, "balance_loss_mlp": 1.06117415, "diversity_loss_mlp": 0.0, "epoch": 0.7393228164678722, "flos": 572819848704.0, "grad_norm": 0.05652902477854722, "language_loss": 0.89046443, "learning_rate": 0.00016786504811736565, "loss": 0.90116704, "num_input_tokens_seen": 318727680, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 3843, "time_per_iteration": 2.706385374069214 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063666, "balance_loss_mlp": 1.05483222, "diversity_loss_mlp": 0.0, "epoch": 0.7395151981531358, "flos": 685237169664.0, "grad_norm": 0.0599118075718357, "language_loss": 0.82577473, "learning_rate": 0.00016763223697010442, "loss": 0.83641136, "num_input_tokens_seen": 318807568, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 3844, "time_per_iteration": 3.0668578147888184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065987, "balance_loss_mlp": 1.05714738, "diversity_loss_mlp": 0.0, "epoch": 0.7397075798383994, "flos": 556366662144.0, "grad_norm": 0.06587022409921209, "language_loss": 0.84292293, "learning_rate": 0.00016739955485945256, "loss": 0.8535828, "num_input_tokens_seen": 318881792, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 3845, "time_per_iteration": 2.76232647895813 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066755, "balance_loss_mlp": 1.05776656, "diversity_loss_mlp": 0.0, "epoch": 0.739899961523663, "flos": 546782400000.0, "grad_norm": 0.07863227392455628, "language_loss": 0.85949242, "learning_rate": 0.00016716700187574513, "loss": 0.87015998, "num_input_tokens_seen": 318951552, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 3846, "time_per_iteration": 2.6615161895751953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068464, "balance_loss_mlp": 1.05967295, "diversity_loss_mlp": 0.0, "epoch": 0.7400923432089265, "flos": 609190419456.0, "grad_norm": 0.0694717633397352, "language_loss": 0.8384943, "learning_rate": 0.0001669345781092675, "loss": 0.84917903, "num_input_tokens_seen": 319022304, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 3847, "time_per_iteration": 2.708287477493286 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068988, "balance_loss_mlp": 1.06022048, "diversity_loss_mlp": 0.0, "epoch": 0.7402847248941901, "flos": 591007518720.0, "grad_norm": 0.08739626570818541, "language_loss": 0.87128854, "learning_rate": 0.0001667022836502546, "loss": 0.88197839, "num_input_tokens_seen": 319093200, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 3848, "time_per_iteration": 2.768453598022461 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01071713, "balance_loss_mlp": 1.06293964, "diversity_loss_mlp": 0.0, "epoch": 0.7404771065794536, "flos": 477369635328.0, "grad_norm": 0.07849103844245357, "language_loss": 0.83004302, "learning_rate": 0.00016647011858889077, "loss": 0.84076011, "num_input_tokens_seen": 319159712, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 3849, "time_per_iteration": 2.553321123123169 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066558, "balance_loss_mlp": 1.05774295, "diversity_loss_mlp": 0.0, "epoch": 0.7406694882647172, "flos": 496446755328.0, "grad_norm": 0.0747699795491948, "language_loss": 0.85671914, "learning_rate": 0.00016623808301531056, "loss": 0.86738473, "num_input_tokens_seen": 319230544, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 3850, "time_per_iteration": 2.6675972938537598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072662, "balance_loss_mlp": 1.06376278, "diversity_loss_mlp": 0.0, "epoch": 0.7408618699499807, "flos": 562205173248.0, "grad_norm": 0.08247164679043814, "language_loss": 0.79259217, "learning_rate": 0.00016600617701959842, "loss": 0.8033188, "num_input_tokens_seen": 319305440, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 3851, "time_per_iteration": 2.7360141277313232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01035221, "balance_loss_mlp": 1.03028595, "diversity_loss_mlp": 0.0, "epoch": 0.7410542516352443, "flos": 1388228834304.0, "grad_norm": 0.02428572869696352, "language_loss": 0.78843814, "learning_rate": 0.00016577440069178811, "loss": 0.79879034, "num_input_tokens_seen": 319534384, "router_z_loss_mlp": 0.04931641, "routerloss_mlp": 0.0, "step": 3852, "time_per_iteration": 4.992321968078613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066311, "balance_loss_mlp": 1.05746007, "diversity_loss_mlp": 0.0, "epoch": 0.7412466333205079, "flos": 669999776256.0, "grad_norm": 0.06380286775900439, "language_loss": 0.81274605, "learning_rate": 0.00016554275412186315, "loss": 0.8234092, "num_input_tokens_seen": 319610960, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 3853, "time_per_iteration": 2.82212495803833 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065457, "balance_loss_mlp": 1.05660534, "diversity_loss_mlp": 0.0, "epoch": 0.7414390150057715, "flos": 489293706240.0, "grad_norm": 0.08235676445627264, "language_loss": 0.80846745, "learning_rate": 0.0001653112373997568, "loss": 0.81912202, "num_input_tokens_seen": 319683872, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 3854, "time_per_iteration": 2.6886162757873535 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01072808, "balance_loss_mlp": 1.06417763, "diversity_loss_mlp": 0.0, "epoch": 0.7416313966910351, "flos": 599393613312.0, "grad_norm": 0.0787808176004402, "language_loss": 0.7459085, "learning_rate": 0.0001650798506153517, "loss": 0.75663662, "num_input_tokens_seen": 319750032, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 3855, "time_per_iteration": 2.699655294418335 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064176, "balance_loss_mlp": 1.05534911, "diversity_loss_mlp": 0.0, "epoch": 0.7418237783762985, "flos": 542539980288.0, "grad_norm": 0.13185112675918914, "language_loss": 0.84102911, "learning_rate": 0.00016484859385848023, "loss": 0.85167086, "num_input_tokens_seen": 319818864, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 3856, "time_per_iteration": 2.6237292289733887 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066087, "balance_loss_mlp": 1.05749846, "diversity_loss_mlp": 0.0, "epoch": 0.7420161600615621, "flos": 544136071680.0, "grad_norm": 0.0735312090287519, "language_loss": 0.77380371, "learning_rate": 0.0001646174672189243, "loss": 0.7844646, "num_input_tokens_seen": 319888816, "router_z_loss_mlp": 0.08599854, "routerloss_mlp": 0.0, "step": 3857, "time_per_iteration": 2.662250518798828 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066781, "balance_loss_mlp": 1.05808437, "diversity_loss_mlp": 0.0, "epoch": 0.7422085417468257, "flos": 527178875904.0, "grad_norm": 0.07158580991852644, "language_loss": 0.80202585, "learning_rate": 0.00016438647078641488, "loss": 0.81269372, "num_input_tokens_seen": 319956176, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 3858, "time_per_iteration": 2.5815234184265137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061243, "balance_loss_mlp": 1.05223656, "diversity_loss_mlp": 0.0, "epoch": 0.7424009234320893, "flos": 508674774528.0, "grad_norm": 0.07922307514532904, "language_loss": 0.82879561, "learning_rate": 0.00016415560465063344, "loss": 0.83940804, "num_input_tokens_seen": 320028560, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 3859, "time_per_iteration": 2.708585739135742 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057118, "balance_loss_mlp": 1.04814172, "diversity_loss_mlp": 0.0, "epoch": 0.7425933051173528, "flos": 512598564864.0, "grad_norm": 0.07844823875052143, "language_loss": 0.79364371, "learning_rate": 0.0001639248689012095, "loss": 0.80421484, "num_input_tokens_seen": 320096112, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 3860, "time_per_iteration": 2.58583927154541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063847, "balance_loss_mlp": 1.05484664, "diversity_loss_mlp": 0.0, "epoch": 0.7427856868026164, "flos": 458302053888.0, "grad_norm": 0.0625994675611715, "language_loss": 0.87600327, "learning_rate": 0.00016369426362772271, "loss": 0.88664174, "num_input_tokens_seen": 320168992, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 3861, "time_per_iteration": 2.7810909748077393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058337, "balance_loss_mlp": 1.04926515, "diversity_loss_mlp": 0.0, "epoch": 0.74297806848788, "flos": 605019580416.0, "grad_norm": 0.06941058470153043, "language_loss": 0.80742699, "learning_rate": 0.00016346378891970233, "loss": 0.81801033, "num_input_tokens_seen": 320247264, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 3862, "time_per_iteration": 2.846928596496582 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063331, "balance_loss_mlp": 1.05435514, "diversity_loss_mlp": 0.0, "epoch": 0.7431704501731435, "flos": 891390044160.0, "grad_norm": 0.0684493510726064, "language_loss": 0.81710279, "learning_rate": 0.00016323344486662633, "loss": 0.82773608, "num_input_tokens_seen": 320338992, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 3863, "time_per_iteration": 3.331202745437622 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061695, "balance_loss_mlp": 1.05259883, "diversity_loss_mlp": 0.0, "epoch": 0.7433628318584071, "flos": 592163841024.0, "grad_norm": 0.05806816249285044, "language_loss": 0.78816247, "learning_rate": 0.00016300323155792247, "loss": 0.79877937, "num_input_tokens_seen": 320422096, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 3864, "time_per_iteration": 2.872833490371704 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060231, "balance_loss_mlp": 1.05139732, "diversity_loss_mlp": 0.0, "epoch": 0.7435552135436706, "flos": 477154520064.0, "grad_norm": 0.06583078508607046, "language_loss": 0.88677347, "learning_rate": 0.00016277314908296687, "loss": 0.89737576, "num_input_tokens_seen": 320492640, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 3865, "time_per_iteration": 2.6268508434295654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062318, "balance_loss_mlp": 1.05286467, "diversity_loss_mlp": 0.0, "epoch": 0.7437475952289342, "flos": 673184618496.0, "grad_norm": 0.08180248385301583, "language_loss": 0.7621361, "learning_rate": 0.00016254319753108604, "loss": 0.77275932, "num_input_tokens_seen": 320565264, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 3866, "time_per_iteration": 2.8856914043426514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062277, "balance_loss_mlp": 1.05305004, "diversity_loss_mlp": 0.0, "epoch": 0.7439399769141978, "flos": 770428786176.0, "grad_norm": 0.07310249763973194, "language_loss": 0.77018058, "learning_rate": 0.00016231337699155492, "loss": 0.78080332, "num_input_tokens_seen": 320647584, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3867, "time_per_iteration": 2.975250244140625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059189, "balance_loss_mlp": 1.04974771, "diversity_loss_mlp": 0.0, "epoch": 0.7441323585994614, "flos": 647777088000.0, "grad_norm": 0.07083990267041149, "language_loss": 0.78228271, "learning_rate": 0.0001620836875535977, "loss": 0.79287452, "num_input_tokens_seen": 320722752, "router_z_loss_mlp": 0.09436035, "routerloss_mlp": 0.0, "step": 3868, "time_per_iteration": 2.856765031814575 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105862, "balance_loss_mlp": 1.04925001, "diversity_loss_mlp": 0.0, "epoch": 0.7443247402847248, "flos": 565372763136.0, "grad_norm": 0.058820941096758894, "language_loss": 0.80752689, "learning_rate": 0.00016185412930638766, "loss": 0.81811309, "num_input_tokens_seen": 320802496, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 3869, "time_per_iteration": 2.7962300777435303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060986, "balance_loss_mlp": 1.05180645, "diversity_loss_mlp": 0.0, "epoch": 0.7445171219699884, "flos": 578529879552.0, "grad_norm": 0.09216022180459393, "language_loss": 0.82565176, "learning_rate": 0.00016162470233904765, "loss": 0.83626163, "num_input_tokens_seen": 320872496, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3870, "time_per_iteration": 2.727376937866211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059095, "balance_loss_mlp": 1.05008888, "diversity_loss_mlp": 0.0, "epoch": 0.744709503655252, "flos": 618875997696.0, "grad_norm": 0.08871714462123159, "language_loss": 0.82108277, "learning_rate": 0.00016139540674064856, "loss": 0.83167374, "num_input_tokens_seen": 320944992, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 3871, "time_per_iteration": 2.747559070587158 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055678, "balance_loss_mlp": 1.04671371, "diversity_loss_mlp": 0.0, "epoch": 0.7449018853405156, "flos": 528619322880.0, "grad_norm": 0.063692065795828, "language_loss": 0.7763024, "learning_rate": 0.00016116624260021113, "loss": 0.78685915, "num_input_tokens_seen": 321020208, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 3872, "time_per_iteration": 2.75909423828125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106192, "balance_loss_mlp": 1.0528599, "diversity_loss_mlp": 0.0, "epoch": 0.7450942670257792, "flos": 433314842112.0, "grad_norm": 0.06099997691226976, "language_loss": 0.83786505, "learning_rate": 0.0001609372100067046, "loss": 0.84848428, "num_input_tokens_seen": 321085984, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 3873, "time_per_iteration": 2.5251874923706055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00796431, "balance_loss_mlp": 1.34714556, "diversity_loss_mlp": 0.22299039, "epoch": 0.7452866487110427, "flos": 696882258432.0, "grad_norm": 0.03925838692514683, "language_loss": 0.85007972, "learning_rate": 0.0001607083090490475, "loss": 0.85804403, "num_input_tokens_seen": 321163200, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01136341, "step": 3874, "time_per_iteration": 2.8896329402923584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061928, "balance_loss_mlp": 1.0527246, "diversity_loss_mlp": 0.0, "epoch": 0.7454790303963063, "flos": 512210552832.0, "grad_norm": 0.07963892031444339, "language_loss": 0.80322075, "learning_rate": 0.00016047953981610714, "loss": 0.81384003, "num_input_tokens_seen": 321237328, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 3875, "time_per_iteration": 2.7198143005371094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0102908, "balance_loss_mlp": 1.02416849, "diversity_loss_mlp": 0.0, "epoch": 0.7456714120815698, "flos": 1325949668352.0, "grad_norm": 0.01953041960218584, "language_loss": 0.7972964, "learning_rate": 0.00016025090239669916, "loss": 0.80758721, "num_input_tokens_seen": 321456192, "router_z_loss_mlp": 0.04907227, "routerloss_mlp": 0.0, "step": 3876, "time_per_iteration": 5.047106981277466 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105976, "balance_loss_mlp": 1.05069435, "diversity_loss_mlp": 0.0, "epoch": 0.7458637937668334, "flos": 721711627776.0, "grad_norm": 0.07139005535531126, "language_loss": 0.80606306, "learning_rate": 0.0001600223968795889, "loss": 0.81666064, "num_input_tokens_seen": 321530560, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3877, "time_per_iteration": 2.8899221420288086 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01027214, "balance_loss_mlp": 1.02230287, "diversity_loss_mlp": 0.0, "epoch": 0.746056175452097, "flos": 1501580395008.0, "grad_norm": 0.018847716252117216, "language_loss": 0.75696075, "learning_rate": 0.00015979402335349004, "loss": 0.76723289, "num_input_tokens_seen": 321760928, "router_z_loss_mlp": 0.04907227, "routerloss_mlp": 0.0, "step": 3878, "time_per_iteration": 4.949044466018677 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063533, "balance_loss_mlp": 1.05449665, "diversity_loss_mlp": 0.0, "epoch": 0.7462485571373605, "flos": 520245711360.0, "grad_norm": 0.08037956070996295, "language_loss": 0.8220886, "learning_rate": 0.00015956578190706483, "loss": 0.83272392, "num_input_tokens_seen": 321833248, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 3879, "time_per_iteration": 2.679077386856079 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058904, "balance_loss_mlp": 1.04966509, "diversity_loss_mlp": 0.0, "epoch": 0.7464409388226241, "flos": 481206790656.0, "grad_norm": 0.07423526276361143, "language_loss": 0.75933188, "learning_rate": 0.00015933767262892468, "loss": 0.76992095, "num_input_tokens_seen": 321905904, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3880, "time_per_iteration": 2.725120782852173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061626, "balance_loss_mlp": 1.05248249, "diversity_loss_mlp": 0.0, "epoch": 0.7466333205078877, "flos": 486761177088.0, "grad_norm": 0.08122487442608403, "language_loss": 0.81791377, "learning_rate": 0.00015910969560762927, "loss": 0.82853001, "num_input_tokens_seen": 321971920, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 3881, "time_per_iteration": 2.5659735202789307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061027, "balance_loss_mlp": 1.05212796, "diversity_loss_mlp": 0.0, "epoch": 0.7468257021931513, "flos": 611293091328.0, "grad_norm": 0.06269003532148706, "language_loss": 0.83085567, "learning_rate": 0.00015888185093168727, "loss": 0.84146595, "num_input_tokens_seen": 322041904, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 3882, "time_per_iteration": 2.7333316802978516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064922, "balance_loss_mlp": 1.0554266, "diversity_loss_mlp": 0.0, "epoch": 0.7470180838784147, "flos": 533459727360.0, "grad_norm": 0.06569405974283654, "language_loss": 0.81109202, "learning_rate": 0.00015865413868955581, "loss": 0.82174122, "num_input_tokens_seen": 322110816, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3883, "time_per_iteration": 2.6078059673309326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058237, "balance_loss_mlp": 1.04946291, "diversity_loss_mlp": 0.0, "epoch": 0.7472104655636783, "flos": 739338388992.0, "grad_norm": 0.057634664266444945, "language_loss": 0.82803142, "learning_rate": 0.00015842655896964054, "loss": 0.83861375, "num_input_tokens_seen": 322192704, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 3884, "time_per_iteration": 3.042433977127075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061863, "balance_loss_mlp": 1.0528096, "diversity_loss_mlp": 0.0, "epoch": 0.7474028472489419, "flos": 640305409536.0, "grad_norm": 0.07244796431130596, "language_loss": 0.73654252, "learning_rate": 0.00015819911186029567, "loss": 0.74716115, "num_input_tokens_seen": 322263888, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 3885, "time_per_iteration": 2.8399569988250732 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063739, "balance_loss_mlp": 1.05458951, "diversity_loss_mlp": 0.0, "epoch": 0.7475952289342055, "flos": 590249120256.0, "grad_norm": 0.0730187367037383, "language_loss": 0.86386681, "learning_rate": 0.00015797179744982443, "loss": 0.87450415, "num_input_tokens_seen": 322331936, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 3886, "time_per_iteration": 2.6979753971099854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068538, "balance_loss_mlp": 1.05947804, "diversity_loss_mlp": 0.0, "epoch": 0.7477876106194691, "flos": 488191712256.0, "grad_norm": 0.06196383449999257, "language_loss": 0.78900141, "learning_rate": 0.00015774461582647765, "loss": 0.79968679, "num_input_tokens_seen": 322402032, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 3887, "time_per_iteration": 2.6235530376434326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01067008, "balance_loss_mlp": 1.05791271, "diversity_loss_mlp": 0.0, "epoch": 0.7479799923047326, "flos": 554733494784.0, "grad_norm": 0.07428746170121639, "language_loss": 0.81271255, "learning_rate": 0.00015751756707845505, "loss": 0.82338268, "num_input_tokens_seen": 322472512, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3888, "time_per_iteration": 2.654217481613159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066279, "balance_loss_mlp": 1.05733204, "diversity_loss_mlp": 0.0, "epoch": 0.7481723739899961, "flos": 767387105280.0, "grad_norm": 0.06349901375293318, "language_loss": 0.8820529, "learning_rate": 0.00015729065129390502, "loss": 0.89271569, "num_input_tokens_seen": 322555104, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 3889, "time_per_iteration": 2.990723133087158 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0107017, "balance_loss_mlp": 1.06125295, "diversity_loss_mlp": 0.0, "epoch": 0.7483647556752597, "flos": 496172542464.0, "grad_norm": 0.10644115001559669, "language_loss": 0.82281494, "learning_rate": 0.0001570638685609241, "loss": 0.83351666, "num_input_tokens_seen": 322621904, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 3890, "time_per_iteration": 2.562049627304077 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064882, "balance_loss_mlp": 1.0558815, "diversity_loss_mlp": 0.0, "epoch": 0.7485571373605233, "flos": 472850431488.0, "grad_norm": 0.07005408827456952, "language_loss": 0.80632579, "learning_rate": 0.00015683721896755693, "loss": 0.81697452, "num_input_tokens_seen": 322688928, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 3891, "time_per_iteration": 2.5688047409057617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01018069, "balance_loss_mlp": 1.01291943, "diversity_loss_mlp": 0.0, "epoch": 0.7487495190457868, "flos": 1554473161728.0, "grad_norm": 0.021126139986013294, "language_loss": 0.82210493, "learning_rate": 0.00015661070260179682, "loss": 0.83228564, "num_input_tokens_seen": 322928464, "router_z_loss_mlp": 0.05151367, "routerloss_mlp": 0.0, "step": 3892, "time_per_iteration": 4.9241249561309814 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063391, "balance_loss_mlp": 1.05425954, "diversity_loss_mlp": 0.0, "epoch": 0.7489419007310504, "flos": 581845773312.0, "grad_norm": 0.07047459901443781, "language_loss": 0.85042292, "learning_rate": 0.00015638431955158528, "loss": 0.8610568, "num_input_tokens_seen": 323002672, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 3893, "time_per_iteration": 2.696835517883301 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059789, "balance_loss_mlp": 1.05092609, "diversity_loss_mlp": 0.0, "epoch": 0.749134282416314, "flos": 567576751104.0, "grad_norm": 0.07429691825865621, "language_loss": 0.81044436, "learning_rate": 0.00015615806990481186, "loss": 0.8210423, "num_input_tokens_seen": 323076480, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 3894, "time_per_iteration": 2.721975088119507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061695, "balance_loss_mlp": 1.05259371, "diversity_loss_mlp": 0.0, "epoch": 0.7493266641015776, "flos": 533061803520.0, "grad_norm": 0.05332768573038703, "language_loss": 0.84447378, "learning_rate": 0.00015593195374931452, "loss": 0.85509074, "num_input_tokens_seen": 323151840, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 3895, "time_per_iteration": 2.724210500717163 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057311, "balance_loss_mlp": 1.04820967, "diversity_loss_mlp": 0.0, "epoch": 0.7495190457868411, "flos": 523613362176.0, "grad_norm": 0.08170178598725314, "language_loss": 0.79939067, "learning_rate": 0.00015570597117287922, "loss": 0.80996376, "num_input_tokens_seen": 323223376, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3896, "time_per_iteration": 2.6550590991973877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058835, "balance_loss_mlp": 1.04970384, "diversity_loss_mlp": 0.0, "epoch": 0.7497114274721046, "flos": 514187315712.0, "grad_norm": 0.07111999470543245, "language_loss": 0.77950025, "learning_rate": 0.0001554801222632406, "loss": 0.79008865, "num_input_tokens_seen": 323290288, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 3897, "time_per_iteration": 2.5913069248199463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058781, "balance_loss_mlp": 1.04961967, "diversity_loss_mlp": 0.0, "epoch": 0.7499038091573682, "flos": 495006308352.0, "grad_norm": 0.07004004520272819, "language_loss": 0.8521589, "learning_rate": 0.00015525440710808052, "loss": 0.86274672, "num_input_tokens_seen": 323359568, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3898, "time_per_iteration": 2.633772850036621 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105768, "balance_loss_mlp": 1.04835165, "diversity_loss_mlp": 0.0, "epoch": 0.7500961908426318, "flos": 737658233856.0, "grad_norm": 0.07310706246925956, "language_loss": 0.77907795, "learning_rate": 0.00015502882579502953, "loss": 0.78965473, "num_input_tokens_seen": 323436688, "router_z_loss_mlp": 0.09332275, "routerloss_mlp": 0.0, "step": 3899, "time_per_iteration": 2.938547372817993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054758, "balance_loss_mlp": 1.04551327, "diversity_loss_mlp": 0.0, "epoch": 0.7502885725278954, "flos": 533400256512.0, "grad_norm": 0.06650950979385485, "language_loss": 0.8470974, "learning_rate": 0.00015480337841166592, "loss": 0.85764492, "num_input_tokens_seen": 323510032, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3900, "time_per_iteration": 2.719611167907715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064406, "balance_loss_mlp": 1.05532193, "diversity_loss_mlp": 0.0, "epoch": 0.7504809542131589, "flos": 589324792320.0, "grad_norm": 0.06798274648693917, "language_loss": 0.83017278, "learning_rate": 0.00015457806504551647, "loss": 0.84081692, "num_input_tokens_seen": 323588896, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 3901, "time_per_iteration": 2.815099000930786 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055292, "balance_loss_mlp": 1.04617858, "diversity_loss_mlp": 0.0, "epoch": 0.7506733358984224, "flos": 511550899200.0, "grad_norm": 0.06551967362841071, "language_loss": 0.78146368, "learning_rate": 0.0001543528857840554, "loss": 0.79201663, "num_input_tokens_seen": 323661280, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 3902, "time_per_iteration": 2.660747528076172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105756, "balance_loss_mlp": 1.04829192, "diversity_loss_mlp": 0.0, "epoch": 0.750865717583686, "flos": 539268503040.0, "grad_norm": 0.08761977110880032, "language_loss": 0.80069476, "learning_rate": 0.000154127840714705, "loss": 0.81127042, "num_input_tokens_seen": 323739200, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3903, "time_per_iteration": 2.791895627975464 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057287, "balance_loss_mlp": 1.04786348, "diversity_loss_mlp": 0.0, "epoch": 0.7510580992689496, "flos": 476578930176.0, "grad_norm": 0.08489214172044417, "language_loss": 0.82145894, "learning_rate": 0.00015390292992483557, "loss": 0.83203179, "num_input_tokens_seen": 323802816, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 3904, "time_per_iteration": 2.531291961669922 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058521, "balance_loss_mlp": 1.04955626, "diversity_loss_mlp": 0.0, "epoch": 0.7512504809542132, "flos": 579043800576.0, "grad_norm": 0.06641081846092535, "language_loss": 0.84235787, "learning_rate": 0.00015367815350176523, "loss": 0.85294312, "num_input_tokens_seen": 323879488, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 3905, "time_per_iteration": 2.7290806770324707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055516, "balance_loss_mlp": 1.04627776, "diversity_loss_mlp": 0.0, "epoch": 0.7514428626394767, "flos": 418660379136.0, "grad_norm": 0.06804815402684934, "language_loss": 0.82392836, "learning_rate": 0.00015345351153275987, "loss": 0.8344835, "num_input_tokens_seen": 323944512, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3906, "time_per_iteration": 2.530323028564453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054875, "balance_loss_mlp": 1.04556477, "diversity_loss_mlp": 0.0, "epoch": 0.7516352443247403, "flos": 641039215104.0, "grad_norm": 0.06371304983723255, "language_loss": 0.80832905, "learning_rate": 0.00015322900410503332, "loss": 0.81887782, "num_input_tokens_seen": 324020688, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 3907, "time_per_iteration": 2.840207576751709 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062824, "balance_loss_mlp": 1.05359089, "diversity_loss_mlp": 0.0, "epoch": 0.7518276260100039, "flos": 580998168576.0, "grad_norm": 0.0661364017188776, "language_loss": 0.77996182, "learning_rate": 0.00015300463130574703, "loss": 0.79059005, "num_input_tokens_seen": 324098080, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 3908, "time_per_iteration": 2.8597986698150635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00795371, "balance_loss_mlp": 1.3454839, "diversity_loss_mlp": 0.22311893, "epoch": 0.7520200076952674, "flos": 687342412800.0, "grad_norm": 0.027335085290279493, "language_loss": 0.81861627, "learning_rate": 0.00015278039322201033, "loss": 0.82656997, "num_input_tokens_seen": 324183968, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01107004, "step": 3909, "time_per_iteration": 2.991687774658203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056027, "balance_loss_mlp": 1.04691339, "diversity_loss_mlp": 0.0, "epoch": 0.7522123893805309, "flos": 486439976448.0, "grad_norm": 0.07802530294793614, "language_loss": 0.79405951, "learning_rate": 0.00015255628994088004, "loss": 0.80461979, "num_input_tokens_seen": 324249568, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 3910, "time_per_iteration": 2.552389621734619 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057943, "balance_loss_mlp": 1.04875183, "diversity_loss_mlp": 0.0, "epoch": 0.7524047710657945, "flos": 818982586368.0, "grad_norm": 0.06839079088853381, "language_loss": 0.75070244, "learning_rate": 0.00015233232154936082, "loss": 0.76128185, "num_input_tokens_seen": 324345312, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3911, "time_per_iteration": 3.2685062885284424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060306, "balance_loss_mlp": 1.05104983, "diversity_loss_mlp": 0.0, "epoch": 0.7525971527510581, "flos": 699508763136.0, "grad_norm": 0.0742904302268966, "language_loss": 0.76248109, "learning_rate": 0.0001521084881344048, "loss": 0.77308416, "num_input_tokens_seen": 324419056, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3912, "time_per_iteration": 2.8669307231903076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063744, "balance_loss_mlp": 1.05449915, "diversity_loss_mlp": 0.0, "epoch": 0.7527895344363217, "flos": 633787421184.0, "grad_norm": 0.07365945451583152, "language_loss": 0.86536098, "learning_rate": 0.00015188478978291208, "loss": 0.87599838, "num_input_tokens_seen": 324490848, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3913, "time_per_iteration": 2.8062844276428223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060792, "balance_loss_mlp": 1.05141592, "diversity_loss_mlp": 0.0, "epoch": 0.7529819161215853, "flos": 562830322176.0, "grad_norm": 0.06964875853647617, "language_loss": 0.86198735, "learning_rate": 0.00015166122658173014, "loss": 0.87259525, "num_input_tokens_seen": 324565648, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 3914, "time_per_iteration": 2.832261085510254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062251, "balance_loss_mlp": 1.05276752, "diversity_loss_mlp": 0.0, "epoch": 0.7531742978068487, "flos": 690665647104.0, "grad_norm": 0.07069372780846282, "language_loss": 0.88695043, "learning_rate": 0.00015143779861765332, "loss": 0.89757293, "num_input_tokens_seen": 324642832, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 3915, "time_per_iteration": 2.876596689224243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057505, "balance_loss_mlp": 1.04845726, "diversity_loss_mlp": 0.0, "epoch": 0.7533666794921123, "flos": 681101208576.0, "grad_norm": 0.07477721009048348, "language_loss": 0.81360573, "learning_rate": 0.00015121450597742458, "loss": 0.82418078, "num_input_tokens_seen": 324718336, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 3916, "time_per_iteration": 2.83457612991333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105545, "balance_loss_mlp": 1.04619908, "diversity_loss_mlp": 0.0, "epoch": 0.7535590611773759, "flos": 623669414400.0, "grad_norm": 0.07347506206734646, "language_loss": 0.78634655, "learning_rate": 0.00015099134874773369, "loss": 0.79690111, "num_input_tokens_seen": 324787744, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3917, "time_per_iteration": 2.7597367763519287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00793692, "balance_loss_mlp": 1.34194863, "diversity_loss_mlp": 0.22241086, "epoch": 0.7537514428626395, "flos": 519427842048.0, "grad_norm": 0.028776380158614775, "language_loss": 0.80358481, "learning_rate": 0.00015076832701521793, "loss": 0.81152171, "num_input_tokens_seen": 324863280, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01151239, "step": 3918, "time_per_iteration": 2.746518135070801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050485, "balance_loss_mlp": 1.04122829, "diversity_loss_mlp": 0.0, "epoch": 0.753943824547903, "flos": 723653512704.0, "grad_norm": 0.08224807804324459, "language_loss": 0.82372093, "learning_rate": 0.000150545440866462, "loss": 0.83422583, "num_input_tokens_seen": 324949600, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 3919, "time_per_iteration": 2.986933708190918 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056343, "balance_loss_mlp": 1.047104, "diversity_loss_mlp": 0.0, "epoch": 0.7541362062331666, "flos": 437547350016.0, "grad_norm": 0.07659379290436485, "language_loss": 0.78524017, "learning_rate": 0.000150322690387998, "loss": 0.79580355, "num_input_tokens_seen": 325013808, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 3920, "time_per_iteration": 2.5535264015197754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053379, "balance_loss_mlp": 1.04395509, "diversity_loss_mlp": 0.0, "epoch": 0.7543285879184302, "flos": 565274018304.0, "grad_norm": 0.08088787979004233, "language_loss": 0.75178206, "learning_rate": 0.00015010007566630535, "loss": 0.76231587, "num_input_tokens_seen": 325084832, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3921, "time_per_iteration": 2.752476930618286 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052876, "balance_loss_mlp": 1.0435003, "diversity_loss_mlp": 0.0, "epoch": 0.7545209696036937, "flos": 521036416512.0, "grad_norm": 0.09066204118342673, "language_loss": 0.81410325, "learning_rate": 0.00014987759678781077, "loss": 0.82463199, "num_input_tokens_seen": 325155120, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 3922, "time_per_iteration": 2.6611218452453613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049687, "balance_loss_mlp": 1.04057336, "diversity_loss_mlp": 0.0, "epoch": 0.7547133512889573, "flos": 616066684416.0, "grad_norm": 0.07014269793522399, "language_loss": 0.82503462, "learning_rate": 0.00014965525383888795, "loss": 0.83553147, "num_input_tokens_seen": 325235632, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 3923, "time_per_iteration": 2.7689740657806396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051594, "balance_loss_mlp": 1.04243279, "diversity_loss_mlp": 0.0, "epoch": 0.7549057329742208, "flos": 750845085696.0, "grad_norm": 0.07037901848858046, "language_loss": 0.72344971, "learning_rate": 0.00014943304690585851, "loss": 0.73396569, "num_input_tokens_seen": 325309696, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 3924, "time_per_iteration": 2.926786184310913 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050623, "balance_loss_mlp": 1.04116416, "diversity_loss_mlp": 0.0, "epoch": 0.7550981146594844, "flos": 514444276224.0, "grad_norm": 0.07074790487011906, "language_loss": 0.79134214, "learning_rate": 0.0001492109760749908, "loss": 0.80184835, "num_input_tokens_seen": 325375744, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 3925, "time_per_iteration": 2.6663551330566406 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048636, "balance_loss_mlp": 1.03920674, "diversity_loss_mlp": 0.0, "epoch": 0.755290496344748, "flos": 522009930240.0, "grad_norm": 0.06259359506310941, "language_loss": 0.79865938, "learning_rate": 0.00014898904143250002, "loss": 0.80914569, "num_input_tokens_seen": 325448384, "router_z_loss_mlp": 0.09417725, "routerloss_mlp": 0.0, "step": 3926, "time_per_iteration": 2.7111570835113525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01007032, "balance_loss_mlp": 1.00193024, "diversity_loss_mlp": 0.0, "epoch": 0.7554828780300116, "flos": 1414615021056.0, "grad_norm": 0.018464770707338953, "language_loss": 0.75755203, "learning_rate": 0.00014876724306454886, "loss": 0.76762235, "num_input_tokens_seen": 325678672, "router_z_loss_mlp": 0.05102539, "routerloss_mlp": 0.0, "step": 3927, "time_per_iteration": 4.9247355461120605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049551, "balance_loss_mlp": 1.04027081, "diversity_loss_mlp": 0.0, "epoch": 0.7556752597152752, "flos": 556937482752.0, "grad_norm": 0.0681788266526358, "language_loss": 0.80484271, "learning_rate": 0.0001485455810572474, "loss": 0.81533813, "num_input_tokens_seen": 325746656, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 3928, "time_per_iteration": 2.644436836242676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050291, "balance_loss_mlp": 1.04075408, "diversity_loss_mlp": 0.0, "epoch": 0.7558676414005386, "flos": 563638279680.0, "grad_norm": 0.05891834719109388, "language_loss": 0.83858299, "learning_rate": 0.00014832405549665236, "loss": 0.84908581, "num_input_tokens_seen": 325820304, "router_z_loss_mlp": 0.09533691, "routerloss_mlp": 0.0, "step": 3929, "time_per_iteration": 2.7012484073638916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045875, "balance_loss_mlp": 1.03651154, "diversity_loss_mlp": 0.0, "epoch": 0.7560600230858022, "flos": 561377392128.0, "grad_norm": 0.06702269562440989, "language_loss": 0.78850049, "learning_rate": 0.00014810266646876746, "loss": 0.79895926, "num_input_tokens_seen": 325895584, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 3930, "time_per_iteration": 2.768267869949341 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104949, "balance_loss_mlp": 1.0400542, "diversity_loss_mlp": 0.0, "epoch": 0.7562524047710658, "flos": 719576649216.0, "grad_norm": 0.07203252309013448, "language_loss": 0.77448905, "learning_rate": 0.00014788141405954364, "loss": 0.78498399, "num_input_tokens_seen": 325976752, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3931, "time_per_iteration": 2.9904940128326416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047773, "balance_loss_mlp": 1.03817058, "diversity_loss_mlp": 0.0, "epoch": 0.7564447864563294, "flos": 543347937792.0, "grad_norm": 0.07800689348595595, "language_loss": 0.8509475, "learning_rate": 0.00014766029835487865, "loss": 0.86142522, "num_input_tokens_seen": 326047152, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3932, "time_per_iteration": 2.712207078933716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050974, "balance_loss_mlp": 1.04148519, "diversity_loss_mlp": 0.0, "epoch": 0.7566371681415929, "flos": 725805743616.0, "grad_norm": 0.09178447768332373, "language_loss": 0.79506183, "learning_rate": 0.0001474393194406173, "loss": 0.80557162, "num_input_tokens_seen": 326119056, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3933, "time_per_iteration": 2.933224678039551 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048897, "balance_loss_mlp": 1.03937268, "diversity_loss_mlp": 0.0, "epoch": 0.7568295498268565, "flos": 576580280832.0, "grad_norm": 0.05892607400759823, "language_loss": 0.79702771, "learning_rate": 0.00014721847740255112, "loss": 0.80751669, "num_input_tokens_seen": 326196736, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 3934, "time_per_iteration": 2.826552391052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003728, "balance_loss_mlp": 0.99864936, "diversity_loss_mlp": 0.0, "epoch": 0.75702193151212, "flos": 1520059903488.0, "grad_norm": 0.02131829704568505, "language_loss": 0.73911923, "learning_rate": 0.00014699777232641853, "loss": 0.74915653, "num_input_tokens_seen": 326404752, "router_z_loss_mlp": 0.05078125, "routerloss_mlp": 0.0, "step": 3935, "time_per_iteration": 4.626272439956665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050547, "balance_loss_mlp": 1.0411061, "diversity_loss_mlp": 0.0, "epoch": 0.7572143131973836, "flos": 525471556608.0, "grad_norm": 0.08283198519893772, "language_loss": 0.78541541, "learning_rate": 0.00014677720429790526, "loss": 0.79592091, "num_input_tokens_seen": 326472832, "router_z_loss_mlp": 0.09429932, "routerloss_mlp": 0.0, "step": 3936, "time_per_iteration": 2.634308338165283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046754, "balance_loss_mlp": 1.03724098, "diversity_loss_mlp": 0.0, "epoch": 0.7574066948826472, "flos": 550738123776.0, "grad_norm": 0.060589870954327815, "language_loss": 0.84442061, "learning_rate": 0.0001465567734026429, "loss": 0.8548882, "num_input_tokens_seen": 326546976, "router_z_loss_mlp": 0.09503174, "routerloss_mlp": 0.0, "step": 3937, "time_per_iteration": 2.716531276702881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051246, "balance_loss_mlp": 1.04150677, "diversity_loss_mlp": 0.0, "epoch": 0.7575990765679107, "flos": 395899176960.0, "grad_norm": 0.08803792614427135, "language_loss": 0.82826757, "learning_rate": 0.00014633647972621034, "loss": 0.83878005, "num_input_tokens_seen": 326609296, "router_z_loss_mlp": 0.09729004, "routerloss_mlp": 0.0, "step": 3938, "time_per_iteration": 2.4589834213256836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053327, "balance_loss_mlp": 1.04381418, "diversity_loss_mlp": 0.0, "epoch": 0.7577914582531743, "flos": 585030615552.0, "grad_norm": 0.07008474871833649, "language_loss": 0.86420083, "learning_rate": 0.00014611632335413354, "loss": 0.87473404, "num_input_tokens_seen": 326687168, "router_z_loss_mlp": 0.09509277, "routerloss_mlp": 0.0, "step": 3939, "time_per_iteration": 2.7953155040740967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055545, "balance_loss_mlp": 1.04597211, "diversity_loss_mlp": 0.0, "epoch": 0.7579838399384379, "flos": 820979172864.0, "grad_norm": 0.06005420836927303, "language_loss": 0.82715803, "learning_rate": 0.00014589630437188456, "loss": 0.83771348, "num_input_tokens_seen": 326777760, "router_z_loss_mlp": 0.09570312, "routerloss_mlp": 0.0, "step": 3940, "time_per_iteration": 3.1720919609069824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056474, "balance_loss_mlp": 1.04727697, "diversity_loss_mlp": 0.0, "epoch": 0.7581762216237015, "flos": 443892441600.0, "grad_norm": 0.07556117037580423, "language_loss": 0.78885162, "learning_rate": 0.00014567642286488253, "loss": 0.7994163, "num_input_tokens_seen": 326843952, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3941, "time_per_iteration": 2.5224215984344482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105497, "balance_loss_mlp": 1.0453198, "diversity_loss_mlp": 0.0, "epoch": 0.7583686033089649, "flos": 540886989312.0, "grad_norm": 0.10380533878684198, "language_loss": 0.79189527, "learning_rate": 0.00014545667891849258, "loss": 0.80244499, "num_input_tokens_seen": 326911296, "router_z_loss_mlp": 0.09649658, "routerloss_mlp": 0.0, "step": 3942, "time_per_iteration": 2.6196579933166504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056634, "balance_loss_mlp": 1.04717493, "diversity_loss_mlp": 0.0, "epoch": 0.7585609849942285, "flos": 522588091392.0, "grad_norm": 0.06980232416240703, "language_loss": 0.82745945, "learning_rate": 0.00014523707261802733, "loss": 0.83802581, "num_input_tokens_seen": 326977776, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 3943, "time_per_iteration": 2.652625799179077 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00794094, "balance_loss_mlp": 1.34365344, "diversity_loss_mlp": 0.22232203, "epoch": 0.7587533666794921, "flos": 541860503040.0, "grad_norm": 0.034795977662747106, "language_loss": 0.81799769, "learning_rate": 0.00014501760404874527, "loss": 0.82593858, "num_input_tokens_seen": 327050240, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01110633, "step": 3944, "time_per_iteration": 2.7529001235961914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059832, "balance_loss_mlp": 1.05071235, "diversity_loss_mlp": 0.0, "epoch": 0.7589457483647557, "flos": 606408270336.0, "grad_norm": 0.07566953086997541, "language_loss": 0.85807776, "learning_rate": 0.00014479827329585176, "loss": 0.86867607, "num_input_tokens_seen": 327119952, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 3945, "time_per_iteration": 2.701622486114502 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051833, "balance_loss_mlp": 1.04233766, "diversity_loss_mlp": 0.0, "epoch": 0.7591381300500193, "flos": 555106452480.0, "grad_norm": 0.05933089648069645, "language_loss": 0.84881538, "learning_rate": 0.00014457908044449846, "loss": 0.85933375, "num_input_tokens_seen": 327192640, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 3946, "time_per_iteration": 2.728095769882202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787034, "balance_loss_mlp": 1.32538223, "diversity_loss_mlp": 0.22601989, "epoch": 0.7593305117352828, "flos": 529681669632.0, "grad_norm": 0.02987157443530754, "language_loss": 0.83105904, "learning_rate": 0.00014436002557978371, "loss": 0.83892936, "num_input_tokens_seen": 327271008, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.011333, "step": 3947, "time_per_iteration": 2.8229527473449707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01009615, "balance_loss_mlp": 1.00491834, "diversity_loss_mlp": 0.0, "epoch": 0.7595228934205464, "flos": 1502798759424.0, "grad_norm": 0.009520189474687826, "language_loss": 0.76643145, "learning_rate": 0.00014414110878675201, "loss": 0.77652764, "num_input_tokens_seen": 327505392, "router_z_loss_mlp": 0.046875, "routerloss_mlp": 0.0, "step": 3948, "time_per_iteration": 6.289541482925415 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060096, "balance_loss_mlp": 1.05072582, "diversity_loss_mlp": 0.0, "epoch": 0.7597152751058099, "flos": 455525047296.0, "grad_norm": 0.06379991139513626, "language_loss": 0.79987645, "learning_rate": 0.0001439223301503945, "loss": 0.8104775, "num_input_tokens_seen": 327569392, "router_z_loss_mlp": 0.09362793, "routerloss_mlp": 0.0, "step": 3949, "time_per_iteration": 2.4896605014801025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063286, "balance_loss_mlp": 1.05441725, "diversity_loss_mlp": 0.0, "epoch": 0.7599076567910735, "flos": 685466966016.0, "grad_norm": 0.07443357695534152, "language_loss": 0.75937033, "learning_rate": 0.00014370368975564834, "loss": 0.7700032, "num_input_tokens_seen": 327648304, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 3950, "time_per_iteration": 2.939652442932129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062595, "balance_loss_mlp": 1.05339789, "diversity_loss_mlp": 0.0, "epoch": 0.760100038476337, "flos": 532372414464.0, "grad_norm": 0.07225326310483449, "language_loss": 0.83501256, "learning_rate": 0.00014348518768739766, "loss": 0.84563851, "num_input_tokens_seen": 327725600, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 3951, "time_per_iteration": 2.760315179824829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01013895, "balance_loss_mlp": 1.00924563, "diversity_loss_mlp": 0.0, "epoch": 0.7602924201616006, "flos": 1471742866944.0, "grad_norm": 0.01015881799745275, "language_loss": 0.7672804, "learning_rate": 0.00014326682403047243, "loss": 0.77741933, "num_input_tokens_seen": 327954048, "router_z_loss_mlp": 0.04638672, "routerloss_mlp": 0.0, "step": 3952, "time_per_iteration": 4.8084025382995605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106276, "balance_loss_mlp": 1.05365205, "diversity_loss_mlp": 0.0, "epoch": 0.7604848018468642, "flos": 774631558656.0, "grad_norm": 0.06460876756714844, "language_loss": 0.86549526, "learning_rate": 0.00014304859886964867, "loss": 0.87612283, "num_input_tokens_seen": 328034656, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3953, "time_per_iteration": 2.9919626712799072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065802, "balance_loss_mlp": 1.05655789, "diversity_loss_mlp": 0.0, "epoch": 0.7606771835321278, "flos": 558185209344.0, "grad_norm": 0.06531272999026969, "language_loss": 0.83625901, "learning_rate": 0.00014283051228964878, "loss": 0.84691703, "num_input_tokens_seen": 328107264, "router_z_loss_mlp": 0.09234619, "routerloss_mlp": 0.0, "step": 3954, "time_per_iteration": 2.7195558547973633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060232, "balance_loss_mlp": 1.05114245, "diversity_loss_mlp": 0.0, "epoch": 0.7608695652173914, "flos": 525397404672.0, "grad_norm": 0.06973579873696066, "language_loss": 0.82862848, "learning_rate": 0.00014261256437514197, "loss": 0.83923078, "num_input_tokens_seen": 328177168, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 3955, "time_per_iteration": 2.6542091369628906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00794195, "balance_loss_mlp": 1.3411088, "diversity_loss_mlp": 0.22477356, "epoch": 0.7610619469026548, "flos": 615038842368.0, "grad_norm": 0.03401627820018092, "language_loss": 0.82645166, "learning_rate": 0.0001423947552107428, "loss": 0.83439362, "num_input_tokens_seen": 328245360, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0112533, "step": 3956, "time_per_iteration": 2.7648067474365234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062618, "balance_loss_mlp": 1.05335546, "diversity_loss_mlp": 0.0, "epoch": 0.7612543285879184, "flos": 863356382208.0, "grad_norm": 0.06632119476384091, "language_loss": 0.77184016, "learning_rate": 0.00014217708488101243, "loss": 0.78246629, "num_input_tokens_seen": 328326560, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3957, "time_per_iteration": 3.1002120971679688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064244, "balance_loss_mlp": 1.05514848, "diversity_loss_mlp": 0.0, "epoch": 0.761446710273182, "flos": 553658664960.0, "grad_norm": 0.08639703813163502, "language_loss": 0.77281177, "learning_rate": 0.0001419595534704579, "loss": 0.78345418, "num_input_tokens_seen": 328395760, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 3958, "time_per_iteration": 2.7124218940734863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062721, "balance_loss_mlp": 1.05369043, "diversity_loss_mlp": 0.0, "epoch": 0.7616390919584456, "flos": 467350373376.0, "grad_norm": 0.06838082339011158, "language_loss": 0.81229275, "learning_rate": 0.00014174216106353237, "loss": 0.82291996, "num_input_tokens_seen": 328464560, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 3959, "time_per_iteration": 2.628516912460327 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060156, "balance_loss_mlp": 1.05085802, "diversity_loss_mlp": 0.0, "epoch": 0.7618314736437091, "flos": 498430858752.0, "grad_norm": 0.07205328766008003, "language_loss": 0.76858711, "learning_rate": 0.00014152490774463512, "loss": 0.77918863, "num_input_tokens_seen": 328532640, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 3960, "time_per_iteration": 2.630159854888916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0106295, "balance_loss_mlp": 1.05382431, "diversity_loss_mlp": 0.0, "epoch": 0.7620238553289727, "flos": 434545316352.0, "grad_norm": 0.0819861529910791, "language_loss": 0.87198371, "learning_rate": 0.00014130779359811135, "loss": 0.88261318, "num_input_tokens_seen": 328595392, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 3961, "time_per_iteration": 2.464413642883301 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058979, "balance_loss_mlp": 1.04990077, "diversity_loss_mlp": 0.0, "epoch": 0.7622162370142362, "flos": 664277262336.0, "grad_norm": 0.07245892571162069, "language_loss": 0.85946453, "learning_rate": 0.0001410908187082521, "loss": 0.87005424, "num_input_tokens_seen": 328676368, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 3962, "time_per_iteration": 2.921780586242676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058262, "balance_loss_mlp": 1.04887986, "diversity_loss_mlp": 0.0, "epoch": 0.7624086186994998, "flos": 557965324800.0, "grad_norm": 0.06688462156779182, "language_loss": 0.83390021, "learning_rate": 0.0001408739831592949, "loss": 0.84448284, "num_input_tokens_seen": 328745136, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3963, "time_per_iteration": 2.6833889484405518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060293, "balance_loss_mlp": 1.05104804, "diversity_loss_mlp": 0.0, "epoch": 0.7626010003847634, "flos": 629132396544.0, "grad_norm": 0.0755930480675871, "language_loss": 0.77544367, "learning_rate": 0.0001406572870354224, "loss": 0.7860465, "num_input_tokens_seen": 328820384, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3964, "time_per_iteration": 2.7871947288513184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060093, "balance_loss_mlp": 1.05084801, "diversity_loss_mlp": 0.0, "epoch": 0.7627933820700269, "flos": 437942702592.0, "grad_norm": 0.06988595261199848, "language_loss": 0.86813599, "learning_rate": 0.00014044073042076337, "loss": 0.87873685, "num_input_tokens_seen": 328884976, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 3965, "time_per_iteration": 2.4948155879974365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064783, "balance_loss_mlp": 1.0558666, "diversity_loss_mlp": 0.0, "epoch": 0.7629857637552905, "flos": 532723350528.0, "grad_norm": 0.053016831320737375, "language_loss": 0.88845956, "learning_rate": 0.00014022431339939302, "loss": 0.8991074, "num_input_tokens_seen": 328957792, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 3966, "time_per_iteration": 2.673383951187134 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057572, "balance_loss_mlp": 1.04824972, "diversity_loss_mlp": 0.0, "epoch": 0.7631781454405541, "flos": 680036290560.0, "grad_norm": 0.09057872820095057, "language_loss": 0.7816959, "learning_rate": 0.00014000803605533163, "loss": 0.79227161, "num_input_tokens_seen": 329034960, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3967, "time_per_iteration": 2.8631951808929443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057314, "balance_loss_mlp": 1.04857016, "diversity_loss_mlp": 0.0, "epoch": 0.7633705271258177, "flos": 507493859328.0, "grad_norm": 0.08630668575925342, "language_loss": 0.84042531, "learning_rate": 0.00013979189847254553, "loss": 0.85099846, "num_input_tokens_seen": 329100848, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 3968, "time_per_iteration": 2.5586295127868652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057911, "balance_loss_mlp": 1.04832053, "diversity_loss_mlp": 0.0, "epoch": 0.7635629088110811, "flos": 618866085888.0, "grad_norm": 0.07119073500769035, "language_loss": 0.80335605, "learning_rate": 0.00013957590073494674, "loss": 0.81393516, "num_input_tokens_seen": 329181120, "router_z_loss_mlp": 0.09576416, "routerloss_mlp": 0.0, "step": 3969, "time_per_iteration": 2.785759449005127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055351, "balance_loss_mlp": 1.0460887, "diversity_loss_mlp": 0.0, "epoch": 0.7637552904963447, "flos": 638425193472.0, "grad_norm": 0.0691753234001315, "language_loss": 0.78865349, "learning_rate": 0.0001393600429263931, "loss": 0.79920697, "num_input_tokens_seen": 329249888, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 3970, "time_per_iteration": 2.7582993507385254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01013524, "balance_loss_mlp": 1.00873148, "diversity_loss_mlp": 0.0, "epoch": 0.7639476721816083, "flos": 1563222302208.0, "grad_norm": 0.011908325756944461, "language_loss": 0.74744886, "learning_rate": 0.00013914432513068792, "loss": 0.7575841, "num_input_tokens_seen": 329483824, "router_z_loss_mlp": 0.04785156, "routerloss_mlp": 0.0, "step": 3971, "time_per_iteration": 4.944155693054199 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051128, "balance_loss_mlp": 1.04182386, "diversity_loss_mlp": 0.0, "epoch": 0.7641400538668719, "flos": 495987162624.0, "grad_norm": 0.07417078530438988, "language_loss": 0.81570405, "learning_rate": 0.0001389287474315804, "loss": 0.82621539, "num_input_tokens_seen": 329553536, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 3972, "time_per_iteration": 2.6553244590759277 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052519, "balance_loss_mlp": 1.04347086, "diversity_loss_mlp": 0.0, "epoch": 0.7643324355521355, "flos": 578441046528.0, "grad_norm": 0.05487535888911553, "language_loss": 0.79840803, "learning_rate": 0.00013871330991276505, "loss": 0.8089332, "num_input_tokens_seen": 329621856, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 3973, "time_per_iteration": 2.681697368621826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052207, "balance_loss_mlp": 1.0428077, "diversity_loss_mlp": 0.0, "epoch": 0.764524817237399, "flos": 784823717376.0, "grad_norm": 0.08960984364762024, "language_loss": 0.80946076, "learning_rate": 0.00013849801265788247, "loss": 0.81998283, "num_input_tokens_seen": 329708192, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3974, "time_per_iteration": 3.0523104667663574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00796632, "balance_loss_mlp": 1.34598541, "diversity_loss_mlp": 0.22497699, "epoch": 0.7647171989226625, "flos": 526279514112.0, "grad_norm": 0.033347453631336434, "language_loss": 0.83125114, "learning_rate": 0.00013828285575051818, "loss": 0.83921754, "num_input_tokens_seen": 329774704, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01115073, "step": 3975, "time_per_iteration": 2.631014108657837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052026, "balance_loss_mlp": 1.04301977, "diversity_loss_mlp": 0.0, "epoch": 0.7649095806079261, "flos": 554876656128.0, "grad_norm": 0.06872239671854397, "language_loss": 0.84060633, "learning_rate": 0.0001380678392742035, "loss": 0.85112655, "num_input_tokens_seen": 329846432, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 3976, "time_per_iteration": 2.710768938064575 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050013, "balance_loss_mlp": 1.04042244, "diversity_loss_mlp": 0.0, "epoch": 0.7651019622931897, "flos": 649145954304.0, "grad_norm": 0.05722299510673748, "language_loss": 0.84721446, "learning_rate": 0.00013785296331241526, "loss": 0.85771459, "num_input_tokens_seen": 329926336, "router_z_loss_mlp": 0.09588623, "routerloss_mlp": 0.0, "step": 3977, "time_per_iteration": 2.863175868988037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049924, "balance_loss_mlp": 1.04060829, "diversity_loss_mlp": 0.0, "epoch": 0.7652943439784533, "flos": 1046449248768.0, "grad_norm": 0.0690026214963165, "language_loss": 0.87410915, "learning_rate": 0.00013763822794857583, "loss": 0.88460839, "num_input_tokens_seen": 330009536, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 3978, "time_per_iteration": 3.3100810050964355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049847, "balance_loss_mlp": 1.04050136, "diversity_loss_mlp": 0.0, "epoch": 0.7654867256637168, "flos": 504350862336.0, "grad_norm": 0.06632607852839086, "language_loss": 0.90003061, "learning_rate": 0.00013742363326605278, "loss": 0.91052908, "num_input_tokens_seen": 330083264, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 3979, "time_per_iteration": 2.754115581512451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052053, "balance_loss_mlp": 1.04258752, "diversity_loss_mlp": 0.0, "epoch": 0.7656791073489804, "flos": 574709976576.0, "grad_norm": 0.059791344398012564, "language_loss": 0.78432417, "learning_rate": 0.00013720917934815935, "loss": 0.79484463, "num_input_tokens_seen": 330157120, "router_z_loss_mlp": 0.09466553, "routerloss_mlp": 0.0, "step": 3980, "time_per_iteration": 2.801797866821289 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053778, "balance_loss_mlp": 1.04425907, "diversity_loss_mlp": 0.0, "epoch": 0.765871489034244, "flos": 492812232192.0, "grad_norm": 0.08312893208703641, "language_loss": 0.82967758, "learning_rate": 0.00013699486627815344, "loss": 0.84021544, "num_input_tokens_seen": 330224560, "router_z_loss_mlp": 0.09509277, "routerloss_mlp": 0.0, "step": 3981, "time_per_iteration": 2.6589224338531494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052365, "balance_loss_mlp": 1.04295897, "diversity_loss_mlp": 0.0, "epoch": 0.7660638707195075, "flos": 486024800256.0, "grad_norm": 0.07260212580199023, "language_loss": 0.82633436, "learning_rate": 0.00013678069413923928, "loss": 0.83685803, "num_input_tokens_seen": 330292000, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 3982, "time_per_iteration": 2.6876726150512695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054538, "balance_loss_mlp": 1.0454247, "diversity_loss_mlp": 0.0, "epoch": 0.766256252404771, "flos": 444295134720.0, "grad_norm": 0.060912508562222696, "language_loss": 0.81971568, "learning_rate": 0.00013656666301456555, "loss": 0.83026105, "num_input_tokens_seen": 330357472, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 3983, "time_per_iteration": 2.547969341278076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051422, "balance_loss_mlp": 1.04195666, "diversity_loss_mlp": 0.0, "epoch": 0.7664486340900346, "flos": 485179766784.0, "grad_norm": 0.07203556219041155, "language_loss": 0.84272242, "learning_rate": 0.0001363527729872267, "loss": 0.85323668, "num_input_tokens_seen": 330427792, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 3984, "time_per_iteration": 2.638418197631836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052921, "balance_loss_mlp": 1.04378974, "diversity_loss_mlp": 0.0, "epoch": 0.7666410157752982, "flos": 646200820224.0, "grad_norm": 0.06683426358110046, "language_loss": 0.76389247, "learning_rate": 0.00013613902414026207, "loss": 0.77442169, "num_input_tokens_seen": 330500320, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 3985, "time_per_iteration": 2.7989237308502197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055274, "balance_loss_mlp": 1.04588056, "diversity_loss_mlp": 0.0, "epoch": 0.7668333974605618, "flos": 774303017472.0, "grad_norm": 0.07515257411295292, "language_loss": 0.82508516, "learning_rate": 0.00013592541655665642, "loss": 0.83563781, "num_input_tokens_seen": 330581696, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 3986, "time_per_iteration": 3.015293836593628 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105591, "balance_loss_mlp": 1.04635525, "diversity_loss_mlp": 0.0, "epoch": 0.7670257791458254, "flos": 613462574592.0, "grad_norm": 0.07774054250244124, "language_loss": 0.85269868, "learning_rate": 0.00013571195031933947, "loss": 0.86325783, "num_input_tokens_seen": 330648000, "router_z_loss_mlp": 0.09552002, "routerloss_mlp": 0.0, "step": 3987, "time_per_iteration": 2.6980810165405273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010581, "balance_loss_mlp": 1.0057168, "diversity_loss_mlp": 0.0, "epoch": 0.7672181608310888, "flos": 1485357378048.0, "grad_norm": 0.012742252799641985, "language_loss": 0.80481339, "learning_rate": 0.00013549862551118626, "loss": 0.81491923, "num_input_tokens_seen": 330873872, "router_z_loss_mlp": 0.04858398, "routerloss_mlp": 0.0, "step": 3988, "time_per_iteration": 4.809666156768799 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049905, "balance_loss_mlp": 1.04043365, "diversity_loss_mlp": 0.0, "epoch": 0.7674105425163524, "flos": 610732182528.0, "grad_norm": 0.07424799958173026, "language_loss": 0.85590923, "learning_rate": 0.00013528544221501655, "loss": 0.86640829, "num_input_tokens_seen": 330945760, "router_z_loss_mlp": 0.09460449, "routerloss_mlp": 0.0, "step": 3989, "time_per_iteration": 2.7649118900299072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010579, "balance_loss_mlp": 1.04848218, "diversity_loss_mlp": 0.0, "epoch": 0.767602924201616, "flos": 845205788160.0, "grad_norm": 0.07001972276723446, "language_loss": 0.81763613, "learning_rate": 0.00013507240051359586, "loss": 0.82821512, "num_input_tokens_seen": 331025584, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 3990, "time_per_iteration": 3.0377867221832275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057165, "balance_loss_mlp": 1.04797447, "diversity_loss_mlp": 0.0, "epoch": 0.7677953058868796, "flos": 527114635776.0, "grad_norm": 0.07160878890290734, "language_loss": 0.86059034, "learning_rate": 0.00013485950048963425, "loss": 0.87116206, "num_input_tokens_seen": 331093008, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 3991, "time_per_iteration": 2.5790224075317383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105318, "balance_loss_mlp": 1.04409003, "diversity_loss_mlp": 0.0, "epoch": 0.7679876875721431, "flos": 923550501888.0, "grad_norm": 0.0667031946156718, "language_loss": 0.82767689, "learning_rate": 0.00013464674222578643, "loss": 0.83820868, "num_input_tokens_seen": 331177120, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 3992, "time_per_iteration": 3.201578140258789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057061, "balance_loss_mlp": 1.04791176, "diversity_loss_mlp": 0.0, "epoch": 0.7681800692574067, "flos": 458087311872.0, "grad_norm": 0.08569609854575283, "language_loss": 0.83404213, "learning_rate": 0.00013443412580465292, "loss": 0.84461272, "num_input_tokens_seen": 331245424, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 3993, "time_per_iteration": 2.5704004764556885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050595, "balance_loss_mlp": 1.04118383, "diversity_loss_mlp": 0.0, "epoch": 0.7683724509426703, "flos": 658436179968.0, "grad_norm": 0.0673936052155154, "language_loss": 0.83964813, "learning_rate": 0.00013422165130877857, "loss": 0.85015404, "num_input_tokens_seen": 331327504, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 3994, "time_per_iteration": 2.9138286113739014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057424, "balance_loss_mlp": 1.0483048, "diversity_loss_mlp": 0.0, "epoch": 0.7685648326279338, "flos": 555284491776.0, "grad_norm": 0.07281784593119212, "language_loss": 0.8049981, "learning_rate": 0.00013400931882065327, "loss": 0.81557238, "num_input_tokens_seen": 331398464, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 3995, "time_per_iteration": 2.6342077255249023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055278, "balance_loss_mlp": 1.04585409, "diversity_loss_mlp": 0.0, "epoch": 0.7687572143131974, "flos": 687404081664.0, "grad_norm": 0.062093519620885704, "language_loss": 0.80842459, "learning_rate": 0.0001337971284227118, "loss": 0.81897736, "num_input_tokens_seen": 331484592, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 3996, "time_per_iteration": 3.0022008419036865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004861, "balance_loss_mlp": 1.00011611, "diversity_loss_mlp": 0.0, "epoch": 0.7689495959984609, "flos": 1489453691904.0, "grad_norm": 0.007312606829584695, "language_loss": 0.76118422, "learning_rate": 0.00013358508019733388, "loss": 0.77123284, "num_input_tokens_seen": 331721360, "router_z_loss_mlp": 0.04736328, "routerloss_mlp": 0.0, "step": 3997, "time_per_iteration": 4.911606311798096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055259, "balance_loss_mlp": 1.04605579, "diversity_loss_mlp": 0.0, "epoch": 0.7691419776837245, "flos": 570405888000.0, "grad_norm": 0.06973120075241693, "language_loss": 0.8046248, "learning_rate": 0.0001333731742268438, "loss": 0.81517738, "num_input_tokens_seen": 331794240, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 3998, "time_per_iteration": 2.683593273162842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053524, "balance_loss_mlp": 1.0442791, "diversity_loss_mlp": 0.0, "epoch": 0.7693343593689881, "flos": 520087495680.0, "grad_norm": 0.0765354269800423, "language_loss": 0.85693717, "learning_rate": 0.0001331614105935109, "loss": 0.86747241, "num_input_tokens_seen": 331866496, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 3999, "time_per_iteration": 2.675220489501953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054062, "balance_loss_mlp": 1.04481769, "diversity_loss_mlp": 0.0, "epoch": 0.7695267410542517, "flos": 660378438144.0, "grad_norm": 0.06349178277774252, "language_loss": 0.84176111, "learning_rate": 0.00013294978937954883, "loss": 0.85230172, "num_input_tokens_seen": 331936592, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 4000, "time_per_iteration": 2.8622941970825195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054979, "balance_loss_mlp": 1.04558492, "diversity_loss_mlp": 0.0, "epoch": 0.7697191227395151, "flos": 546809564160.0, "grad_norm": 0.09234703224205486, "language_loss": 0.85414779, "learning_rate": 0.00013273831066711655, "loss": 0.86469758, "num_input_tokens_seen": 332003536, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 4001, "time_per_iteration": 2.6298534870147705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052476, "balance_loss_mlp": 1.04325461, "diversity_loss_mlp": 0.0, "epoch": 0.7699115044247787, "flos": 540610205184.0, "grad_norm": 0.06055695533202859, "language_loss": 0.79907209, "learning_rate": 0.00013252697453831747, "loss": 0.8095969, "num_input_tokens_seen": 332075248, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 4002, "time_per_iteration": 2.692922830581665 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047306, "balance_loss_mlp": 1.03798985, "diversity_loss_mlp": 0.0, "epoch": 0.7701038861100423, "flos": 562936407552.0, "grad_norm": 0.06495740089460322, "language_loss": 0.82613641, "learning_rate": 0.00013231578107519916, "loss": 0.83660942, "num_input_tokens_seen": 332158944, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 4003, "time_per_iteration": 2.9229555130004883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049706, "balance_loss_mlp": 1.04049134, "diversity_loss_mlp": 0.0, "epoch": 0.7702962677953059, "flos": 481737964032.0, "grad_norm": 0.07621650724161941, "language_loss": 0.82803172, "learning_rate": 0.00013210473035975422, "loss": 0.83852881, "num_input_tokens_seen": 332226368, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 4004, "time_per_iteration": 2.569532632827759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050417, "balance_loss_mlp": 1.04116035, "diversity_loss_mlp": 0.0, "epoch": 0.7704886494805695, "flos": 770389138944.0, "grad_norm": 0.07296352629436301, "language_loss": 0.85812414, "learning_rate": 0.0001318938224739201, "loss": 0.86862826, "num_input_tokens_seen": 332314784, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 4005, "time_per_iteration": 3.0234341621398926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049929, "balance_loss_mlp": 1.04063106, "diversity_loss_mlp": 0.0, "epoch": 0.770681031165833, "flos": 601192336896.0, "grad_norm": 0.06528825004105314, "language_loss": 0.83766401, "learning_rate": 0.00013168305749957843, "loss": 0.84816337, "num_input_tokens_seen": 332387952, "router_z_loss_mlp": 0.09301758, "routerloss_mlp": 0.0, "step": 4006, "time_per_iteration": 2.733548641204834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790765, "balance_loss_mlp": 1.33768153, "diversity_loss_mlp": 0.22157404, "epoch": 0.7708734128510966, "flos": 496108302336.0, "grad_norm": 0.030772470198916744, "language_loss": 0.82874978, "learning_rate": 0.00013147243551855532, "loss": 0.8366574, "num_input_tokens_seen": 332456352, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01113757, "step": 4007, "time_per_iteration": 2.6124446392059326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049736, "balance_loss_mlp": 1.0404737, "diversity_loss_mlp": 0.0, "epoch": 0.7710657945363601, "flos": 567299966976.0, "grad_norm": 0.05859111752284866, "language_loss": 0.80677342, "learning_rate": 0.00013126195661262148, "loss": 0.81727076, "num_input_tokens_seen": 332534288, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 4008, "time_per_iteration": 2.7372946739196777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052041, "balance_loss_mlp": 1.04286766, "diversity_loss_mlp": 0.0, "epoch": 0.7712581762216237, "flos": 604550075904.0, "grad_norm": 0.06950402202343967, "language_loss": 0.86921602, "learning_rate": 0.00013105162086349216, "loss": 0.87973642, "num_input_tokens_seen": 332615440, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4009, "time_per_iteration": 2.825164556503296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050421, "balance_loss_mlp": 1.04102159, "diversity_loss_mlp": 0.0, "epoch": 0.7714505579068872, "flos": 530894891520.0, "grad_norm": 0.05664497988696294, "language_loss": 0.85945249, "learning_rate": 0.00013084142835282687, "loss": 0.86995667, "num_input_tokens_seen": 332687360, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 4010, "time_per_iteration": 2.6627306938171387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00590218, "balance_loss_mlp": 1.02735484, "diversity_loss_mlp": 0.13424492, "epoch": 0.7716429395921508, "flos": 1422205267968.0, "grad_norm": 0.0012430140076356488, "language_loss": 0.79884362, "learning_rate": 0.00013063137916222956, "loss": 0.80474579, "num_input_tokens_seen": 332919936, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00941846, "step": 4011, "time_per_iteration": 4.808507919311523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050884, "balance_loss_mlp": 1.04154992, "diversity_loss_mlp": 0.0, "epoch": 0.7718353212774144, "flos": 578428563456.0, "grad_norm": 0.062052307609784016, "language_loss": 0.89290094, "learning_rate": 0.0001304214733732485, "loss": 0.90340984, "num_input_tokens_seen": 332990096, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 4012, "time_per_iteration": 2.7328708171844482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105349, "balance_loss_mlp": 1.04380453, "diversity_loss_mlp": 0.0, "epoch": 0.772027702962678, "flos": 510742941696.0, "grad_norm": 0.07734543299334512, "language_loss": 0.82669097, "learning_rate": 0.00013021171106737672, "loss": 0.83722585, "num_input_tokens_seen": 333063616, "router_z_loss_mlp": 0.09686279, "routerloss_mlp": 0.0, "step": 4013, "time_per_iteration": 2.6573734283447266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049877, "balance_loss_mlp": 1.04070377, "diversity_loss_mlp": 0.0, "epoch": 0.7722200846479416, "flos": 525661705728.0, "grad_norm": 0.06603423132938777, "language_loss": 0.80092031, "learning_rate": 0.00013000209232605071, "loss": 0.81141913, "num_input_tokens_seen": 333136368, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4014, "time_per_iteration": 2.717602014541626 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053571, "balance_loss_mlp": 1.04388535, "diversity_loss_mlp": 0.0, "epoch": 0.772412466333205, "flos": 479598216192.0, "grad_norm": 0.10571386830465022, "language_loss": 0.80179751, "learning_rate": 0.0001297926172306519, "loss": 0.81233323, "num_input_tokens_seen": 333207136, "router_z_loss_mlp": 0.09674072, "routerloss_mlp": 0.0, "step": 4015, "time_per_iteration": 2.65010142326355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051804, "balance_loss_mlp": 1.04230273, "diversity_loss_mlp": 0.0, "epoch": 0.7726048480184686, "flos": 905688801792.0, "grad_norm": 0.06492582612573077, "language_loss": 0.7883606, "learning_rate": 0.0001295832858625055, "loss": 0.79887861, "num_input_tokens_seen": 333291920, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 4016, "time_per_iteration": 3.2565736770629883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050538, "balance_loss_mlp": 1.04109037, "diversity_loss_mlp": 0.0, "epoch": 0.7727972297037322, "flos": 631380801024.0, "grad_norm": 0.06662088321139942, "language_loss": 0.70083648, "learning_rate": 0.00012937409830288154, "loss": 0.71134186, "num_input_tokens_seen": 333369824, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 4017, "time_per_iteration": 2.818197250366211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046111, "balance_loss_mlp": 1.03688383, "diversity_loss_mlp": 0.0, "epoch": 0.7729896113889958, "flos": 414786147840.0, "grad_norm": 0.08953669234150197, "language_loss": 0.84953344, "learning_rate": 0.00012916505463299362, "loss": 0.85999447, "num_input_tokens_seen": 333434192, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4018, "time_per_iteration": 2.5104525089263916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104651, "balance_loss_mlp": 1.03696132, "diversity_loss_mlp": 0.0, "epoch": 0.7731819930742593, "flos": 668907694080.0, "grad_norm": 0.08710028809718832, "language_loss": 0.78235918, "learning_rate": 0.00012895615493399972, "loss": 0.79282427, "num_input_tokens_seen": 333509696, "router_z_loss_mlp": 0.09552002, "routerloss_mlp": 0.0, "step": 4019, "time_per_iteration": 2.7878103256225586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104679, "balance_loss_mlp": 1.03747368, "diversity_loss_mlp": 0.0, "epoch": 0.7733743747595229, "flos": 489854615040.0, "grad_norm": 0.07808729146965544, "language_loss": 0.82637143, "learning_rate": 0.00012874739928700192, "loss": 0.83683932, "num_input_tokens_seen": 333575184, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 4020, "time_per_iteration": 2.5788097381591797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044505, "balance_loss_mlp": 1.03501582, "diversity_loss_mlp": 0.0, "epoch": 0.7735667564447865, "flos": 659612325888.0, "grad_norm": 0.07324265685000747, "language_loss": 0.79874408, "learning_rate": 0.00012853878777304624, "loss": 0.80918914, "num_input_tokens_seen": 333651568, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 4021, "time_per_iteration": 2.870278835296631 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00794381, "balance_loss_mlp": 1.34430456, "diversity_loss_mlp": 0.22252312, "epoch": 0.77375913813005, "flos": 533383004160.0, "grad_norm": 0.029931863934209574, "language_loss": 0.84459031, "learning_rate": 0.000128330320473123, "loss": 0.85253412, "num_input_tokens_seen": 333726400, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01096685, "step": 4022, "time_per_iteration": 2.7129287719726562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01008173, "balance_loss_mlp": 1.00330901, "diversity_loss_mlp": 0.0, "epoch": 0.7739515198153136, "flos": 1520081925120.0, "grad_norm": 0.013994594591819043, "language_loss": 0.783319, "learning_rate": 0.00012812199746816628, "loss": 0.7934007, "num_input_tokens_seen": 333960224, "router_z_loss_mlp": 0.04858398, "routerloss_mlp": 0.0, "step": 4023, "time_per_iteration": 4.895900726318359 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051725, "balance_loss_mlp": 1.04231346, "diversity_loss_mlp": 0.0, "epoch": 0.7741439015005771, "flos": 640105348608.0, "grad_norm": 0.07018696985022486, "language_loss": 0.81708258, "learning_rate": 0.0001279138188390543, "loss": 0.82759976, "num_input_tokens_seen": 334033904, "router_z_loss_mlp": 0.09405518, "routerloss_mlp": 0.0, "step": 4024, "time_per_iteration": 2.745079517364502 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050638, "balance_loss_mlp": 1.04130435, "diversity_loss_mlp": 0.0, "epoch": 0.7743362831858407, "flos": 665841420288.0, "grad_norm": 0.06486800405407347, "language_loss": 0.86009115, "learning_rate": 0.00012770578466660915, "loss": 0.87059748, "num_input_tokens_seen": 334107904, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 4025, "time_per_iteration": 2.848886013031006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054208, "balance_loss_mlp": 1.04474843, "diversity_loss_mlp": 0.0, "epoch": 0.7745286648711043, "flos": 562760939520.0, "grad_norm": 0.06391594939980325, "language_loss": 0.81626999, "learning_rate": 0.0001274978950315968, "loss": 0.82681203, "num_input_tokens_seen": 334184048, "router_z_loss_mlp": 0.09454346, "routerloss_mlp": 0.0, "step": 4026, "time_per_iteration": 2.791773796081543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104997, "balance_loss_mlp": 1.04037929, "diversity_loss_mlp": 0.0, "epoch": 0.7747210465563679, "flos": 516912565248.0, "grad_norm": 0.11270799389052534, "language_loss": 0.83240479, "learning_rate": 0.00012729015001472716, "loss": 0.84290445, "num_input_tokens_seen": 334257152, "router_z_loss_mlp": 0.0958252, "routerloss_mlp": 0.0, "step": 4027, "time_per_iteration": 2.6333580017089844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051145, "balance_loss_mlp": 1.04164386, "diversity_loss_mlp": 0.0, "epoch": 0.7749134282416313, "flos": 634209937920.0, "grad_norm": 0.06039716871949276, "language_loss": 0.81597829, "learning_rate": 0.00012708254969665418, "loss": 0.82648969, "num_input_tokens_seen": 334331312, "router_z_loss_mlp": 0.0949707, "routerloss_mlp": 0.0, "step": 4028, "time_per_iteration": 2.753960132598877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057336, "balance_loss_mlp": 1.0482347, "diversity_loss_mlp": 0.0, "epoch": 0.7751058099268949, "flos": 495364584960.0, "grad_norm": 0.08015627547619836, "language_loss": 0.83207834, "learning_rate": 0.00012687509415797526, "loss": 0.84265172, "num_input_tokens_seen": 334397344, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 4029, "time_per_iteration": 2.549224376678467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055891, "balance_loss_mlp": 1.04669952, "diversity_loss_mlp": 0.0, "epoch": 0.7752981916121585, "flos": 510310513152.0, "grad_norm": 0.0754412874698092, "language_loss": 0.81577122, "learning_rate": 0.00012666778347923208, "loss": 0.82633013, "num_input_tokens_seen": 334467872, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 4030, "time_per_iteration": 2.6578049659729004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058934, "balance_loss_mlp": 1.04996991, "diversity_loss_mlp": 0.0, "epoch": 0.7754905732974221, "flos": 497548749312.0, "grad_norm": 0.05434911795401194, "language_loss": 0.83884913, "learning_rate": 0.0001264606177409092, "loss": 0.84943849, "num_input_tokens_seen": 334539088, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4031, "time_per_iteration": 2.7437548637390137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054431, "balance_loss_mlp": 1.04539514, "diversity_loss_mlp": 0.0, "epoch": 0.7756829549826857, "flos": 480744626688.0, "grad_norm": 0.06981681066227559, "language_loss": 0.85926938, "learning_rate": 0.00012625359702343609, "loss": 0.86981368, "num_input_tokens_seen": 334612576, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4032, "time_per_iteration": 2.7145252227783203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01062978, "balance_loss_mlp": 1.05414999, "diversity_loss_mlp": 0.0, "epoch": 0.7758753366679492, "flos": 552630822912.0, "grad_norm": 0.06703655691775996, "language_loss": 0.84627414, "learning_rate": 0.00012604672140718504, "loss": 0.85690391, "num_input_tokens_seen": 334677824, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4033, "time_per_iteration": 2.6776609420776367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061314, "balance_loss_mlp": 1.05224824, "diversity_loss_mlp": 0.0, "epoch": 0.7760677183532128, "flos": 703835246592.0, "grad_norm": 0.0713724123127894, "language_loss": 0.77912575, "learning_rate": 0.00012583999097247233, "loss": 0.78973895, "num_input_tokens_seen": 334751456, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4034, "time_per_iteration": 2.8429367542266846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058219, "balance_loss_mlp": 1.04938531, "diversity_loss_mlp": 0.0, "epoch": 0.7762601000384763, "flos": 523470200832.0, "grad_norm": 0.07138701732892383, "language_loss": 0.80042505, "learning_rate": 0.0001256334057995578, "loss": 0.81100732, "num_input_tokens_seen": 334823008, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4035, "time_per_iteration": 2.805361032485962 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060081, "balance_loss_mlp": 1.05109227, "diversity_loss_mlp": 0.0, "epoch": 0.7764524817237399, "flos": 557532896256.0, "grad_norm": 0.06152435345467902, "language_loss": 0.85125613, "learning_rate": 0.000125426965968645, "loss": 0.86185694, "num_input_tokens_seen": 334896048, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4036, "time_per_iteration": 2.7150938510894775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064057, "balance_loss_mlp": 1.05523515, "diversity_loss_mlp": 0.0, "epoch": 0.7766448634090035, "flos": 579725849088.0, "grad_norm": 0.07000613008602406, "language_loss": 0.819399, "learning_rate": 0.00012522067155988092, "loss": 0.83003962, "num_input_tokens_seen": 334964416, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4037, "time_per_iteration": 2.6996352672576904 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060303, "balance_loss_mlp": 1.05135584, "diversity_loss_mlp": 0.0, "epoch": 0.776837245094267, "flos": 635603397120.0, "grad_norm": 0.0718823999319763, "language_loss": 0.75306779, "learning_rate": 0.00012501452265335617, "loss": 0.7636708, "num_input_tokens_seen": 335043360, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4038, "time_per_iteration": 2.8315415382385254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066111, "balance_loss_mlp": 1.05724156, "diversity_loss_mlp": 0.0, "epoch": 0.7770296267795306, "flos": 614680565760.0, "grad_norm": 0.06411925705378174, "language_loss": 0.83063197, "learning_rate": 0.0001248085193291047, "loss": 0.84129304, "num_input_tokens_seen": 335113216, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4039, "time_per_iteration": 2.729095935821533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01069535, "balance_loss_mlp": 1.0605464, "diversity_loss_mlp": 0.0, "epoch": 0.7772220084647942, "flos": 878808890880.0, "grad_norm": 0.05882048458025786, "language_loss": 0.82089669, "learning_rate": 0.00012460266166710443, "loss": 0.83159202, "num_input_tokens_seen": 335195824, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4040, "time_per_iteration": 3.1514501571655273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01068929, "balance_loss_mlp": 1.06013775, "diversity_loss_mlp": 0.0, "epoch": 0.7774143901500578, "flos": 839641489920.0, "grad_norm": 0.07867166554480139, "language_loss": 0.77746958, "learning_rate": 0.00012439694974727633, "loss": 0.78815889, "num_input_tokens_seen": 335269712, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4041, "time_per_iteration": 3.0117955207824707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01065961, "balance_loss_mlp": 1.05708027, "diversity_loss_mlp": 0.0, "epoch": 0.7776067718353212, "flos": 568147571712.0, "grad_norm": 0.06430167773545564, "language_loss": 0.79798543, "learning_rate": 0.00012419138364948458, "loss": 0.80864501, "num_input_tokens_seen": 335343408, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4042, "time_per_iteration": 2.7055745124816895 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064858, "balance_loss_mlp": 1.05601263, "diversity_loss_mlp": 0.0, "epoch": 0.7777991535205848, "flos": 745943012352.0, "grad_norm": 0.06788477072783218, "language_loss": 0.82296908, "learning_rate": 0.00012398596345353702, "loss": 0.83361769, "num_input_tokens_seen": 335415360, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4043, "time_per_iteration": 2.8943872451782227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064992, "balance_loss_mlp": 1.05608058, "diversity_loss_mlp": 0.0, "epoch": 0.7779915352058484, "flos": 538075104768.0, "grad_norm": 0.06253380969554054, "language_loss": 0.83342338, "learning_rate": 0.0001237806892391851, "loss": 0.8440733, "num_input_tokens_seen": 335491568, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4044, "time_per_iteration": 2.697079658508301 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061837, "balance_loss_mlp": 1.05312264, "diversity_loss_mlp": 0.0, "epoch": 0.778183916891112, "flos": 634788099072.0, "grad_norm": 0.07069263559946819, "language_loss": 0.81128013, "learning_rate": 0.0001235755610861233, "loss": 0.82189852, "num_input_tokens_seen": 335567200, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4045, "time_per_iteration": 2.7329134941101074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01066232, "balance_loss_mlp": 1.05731463, "diversity_loss_mlp": 0.0, "epoch": 0.7783762985763756, "flos": 588677621760.0, "grad_norm": 0.07032278053298287, "language_loss": 0.85504925, "learning_rate": 0.0001233705790739893, "loss": 0.86571157, "num_input_tokens_seen": 335640512, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4046, "time_per_iteration": 2.708867073059082 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061968, "balance_loss_mlp": 1.05317056, "diversity_loss_mlp": 0.0, "epoch": 0.7785686802616391, "flos": 930656563200.0, "grad_norm": 0.08570945023626393, "language_loss": 0.7512747, "learning_rate": 0.0001231657432823643, "loss": 0.76189435, "num_input_tokens_seen": 335726016, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4047, "time_per_iteration": 3.209035634994507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01064295, "balance_loss_mlp": 1.05536008, "diversity_loss_mlp": 0.0, "epoch": 0.7787610619469026, "flos": 497934190080.0, "grad_norm": 0.07478772193794427, "language_loss": 0.78683329, "learning_rate": 0.0001229610537907725, "loss": 0.79747623, "num_input_tokens_seen": 335794864, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4048, "time_per_iteration": 2.570645332336426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01063203, "balance_loss_mlp": 1.05442929, "diversity_loss_mlp": 0.0, "epoch": 0.7789534436321662, "flos": 515637674496.0, "grad_norm": 0.07810921414498996, "language_loss": 0.90262878, "learning_rate": 0.00012275651067868143, "loss": 0.91326082, "num_input_tokens_seen": 335860928, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4049, "time_per_iteration": 2.5862553119659424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058533, "balance_loss_mlp": 1.04978311, "diversity_loss_mlp": 0.0, "epoch": 0.7791458253174298, "flos": 988476369408.0, "grad_norm": 0.05845393765756997, "language_loss": 0.80259252, "learning_rate": 0.00012255211402550182, "loss": 0.81317782, "num_input_tokens_seen": 335945728, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4050, "time_per_iteration": 3.2020328044891357 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055369, "balance_loss_mlp": 1.04645181, "diversity_loss_mlp": 0.0, "epoch": 0.7793382070026933, "flos": 629040992256.0, "grad_norm": 0.07830185849799275, "language_loss": 0.76506507, "learning_rate": 0.00012234786391058727, "loss": 0.77561879, "num_input_tokens_seen": 336014848, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4051, "time_per_iteration": 2.823751449584961 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059116, "balance_loss_mlp": 1.05021727, "diversity_loss_mlp": 0.0, "epoch": 0.7795305886879569, "flos": 531752408064.0, "grad_norm": 0.07934971719083544, "language_loss": 0.85162616, "learning_rate": 0.0001221437604132352, "loss": 0.86221731, "num_input_tokens_seen": 336080096, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4052, "time_per_iteration": 2.6284594535827637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054893, "balance_loss_mlp": 1.04598236, "diversity_loss_mlp": 0.0, "epoch": 0.7797229703732205, "flos": 611979909120.0, "grad_norm": 0.07077897315409304, "language_loss": 0.8102321, "learning_rate": 0.0001219398036126852, "loss": 0.82078099, "num_input_tokens_seen": 336154640, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4053, "time_per_iteration": 2.7439231872558594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059971, "balance_loss_mlp": 1.05101228, "diversity_loss_mlp": 0.0, "epoch": 0.7799153520584841, "flos": 872164620288.0, "grad_norm": 0.06870313821829518, "language_loss": 0.78245676, "learning_rate": 0.00012173599358812027, "loss": 0.79305649, "num_input_tokens_seen": 336244160, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4054, "time_per_iteration": 3.256080150604248 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058619, "balance_loss_mlp": 1.04986334, "diversity_loss_mlp": 0.0, "epoch": 0.7801077337437476, "flos": 583627244544.0, "grad_norm": 0.07402592003625927, "language_loss": 0.82719493, "learning_rate": 0.0001215323304186668, "loss": 0.83778107, "num_input_tokens_seen": 336317936, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4055, "time_per_iteration": 2.7612040042877197 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105856, "balance_loss_mlp": 1.05008435, "diversity_loss_mlp": 0.0, "epoch": 0.7803001154290111, "flos": 601165172736.0, "grad_norm": 0.06917846158934658, "language_loss": 0.87829256, "learning_rate": 0.00012132881418339364, "loss": 0.88887817, "num_input_tokens_seen": 336389504, "router_z_loss_mlp": 0.08483887, "routerloss_mlp": 0.0, "step": 4056, "time_per_iteration": 2.7365031242370605 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006422, "balance_loss_mlp": 1.00186825, "diversity_loss_mlp": 0.0, "epoch": 0.7804924971142747, "flos": 1479577591296.0, "grad_norm": 0.016656968003394067, "language_loss": 0.77517563, "learning_rate": 0.00012112544496131306, "loss": 0.78523988, "num_input_tokens_seen": 336615536, "router_z_loss_mlp": 0.0456543, "routerloss_mlp": 0.0, "step": 4057, "time_per_iteration": 4.83305811882019 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105678, "balance_loss_mlp": 1.04785705, "diversity_loss_mlp": 0.0, "epoch": 0.7806848787995383, "flos": 630362870784.0, "grad_norm": 0.06805160455788861, "language_loss": 0.77303064, "learning_rate": 0.00012092222283137944, "loss": 0.78359842, "num_input_tokens_seen": 336686400, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4058, "time_per_iteration": 2.749647617340088 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100669, "balance_loss_mlp": 1.00213623, "diversity_loss_mlp": 0.0, "epoch": 0.7808772604848019, "flos": 1417587319296.0, "grad_norm": 0.014137874321597207, "language_loss": 0.7890631, "learning_rate": 0.00012071914787249111, "loss": 0.79913002, "num_input_tokens_seen": 336912704, "router_z_loss_mlp": 0.0456543, "routerloss_mlp": 0.0, "step": 4059, "time_per_iteration": 4.786531209945679 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060827, "balance_loss_mlp": 1.0521071, "diversity_loss_mlp": 0.0, "epoch": 0.7810696421700654, "flos": 731696011776.0, "grad_norm": 0.0627573295973092, "language_loss": 0.83679825, "learning_rate": 0.00012051622016348856, "loss": 0.84740651, "num_input_tokens_seen": 336997040, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4060, "time_per_iteration": 2.999849557876587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060136, "balance_loss_mlp": 1.05145788, "diversity_loss_mlp": 0.0, "epoch": 0.781262023855329, "flos": 424941230592.0, "grad_norm": 0.09064537340570315, "language_loss": 0.84317231, "learning_rate": 0.00012031343978315539, "loss": 0.85377359, "num_input_tokens_seen": 337059760, "router_z_loss_mlp": 0.08685303, "routerloss_mlp": 0.0, "step": 4061, "time_per_iteration": 2.468447208404541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056381, "balance_loss_mlp": 1.04746997, "diversity_loss_mlp": 0.0, "epoch": 0.7814544055405925, "flos": 501027628032.0, "grad_norm": 0.06926307807295869, "language_loss": 0.8253361, "learning_rate": 0.00012011080681021774, "loss": 0.83589995, "num_input_tokens_seen": 337128528, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4062, "time_per_iteration": 2.6554322242736816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058674, "balance_loss_mlp": 1.04981685, "diversity_loss_mlp": 0.0, "epoch": 0.7816467872258561, "flos": 462448300032.0, "grad_norm": 0.07294593948757502, "language_loss": 0.86419785, "learning_rate": 0.00011990832132334512, "loss": 0.87478459, "num_input_tokens_seen": 337194112, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4063, "time_per_iteration": 2.514464855194092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054396, "balance_loss_mlp": 1.04535961, "diversity_loss_mlp": 0.0, "epoch": 0.7818391689111197, "flos": 740818483200.0, "grad_norm": 0.07578138035513655, "language_loss": 0.82624197, "learning_rate": 0.00011970598340114897, "loss": 0.83678591, "num_input_tokens_seen": 337270416, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4064, "time_per_iteration": 2.931457042694092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051575, "balance_loss_mlp": 1.04267633, "diversity_loss_mlp": 0.0, "epoch": 0.7820315505963832, "flos": 547669278720.0, "grad_norm": 0.07400316047770077, "language_loss": 0.84204572, "learning_rate": 0.00011950379312218396, "loss": 0.85256147, "num_input_tokens_seen": 337343024, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4065, "time_per_iteration": 2.7011330127716064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053821, "balance_loss_mlp": 1.04467154, "diversity_loss_mlp": 0.0, "epoch": 0.7822239322816468, "flos": 728983245312.0, "grad_norm": 0.057956585414562535, "language_loss": 0.86203766, "learning_rate": 0.00011930175056494719, "loss": 0.87257588, "num_input_tokens_seen": 337417232, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 4066, "time_per_iteration": 2.877427816390991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054242, "balance_loss_mlp": 1.04519939, "diversity_loss_mlp": 0.0, "epoch": 0.7824163139669104, "flos": 452016433152.0, "grad_norm": 0.057083401886059204, "language_loss": 0.75923216, "learning_rate": 0.00011909985580787885, "loss": 0.76977456, "num_input_tokens_seen": 337488224, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4067, "time_per_iteration": 2.624633312225342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047724, "balance_loss_mlp": 1.03850365, "diversity_loss_mlp": 0.0, "epoch": 0.782608695652174, "flos": 540489065472.0, "grad_norm": 0.05949124262263275, "language_loss": 0.81228232, "learning_rate": 0.00011889810892936137, "loss": 0.82275951, "num_input_tokens_seen": 337564928, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4068, "time_per_iteration": 2.736132860183716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060005, "balance_loss_mlp": 1.05080259, "diversity_loss_mlp": 0.0, "epoch": 0.7828010773374374, "flos": 500308503552.0, "grad_norm": 0.067986892151795, "language_loss": 0.77103662, "learning_rate": 0.00011869651000771959, "loss": 0.78163677, "num_input_tokens_seen": 337641632, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 4069, "time_per_iteration": 2.8403103351593018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054344, "balance_loss_mlp": 1.04549229, "diversity_loss_mlp": 0.0, "epoch": 0.782993459022701, "flos": 600816807936.0, "grad_norm": 0.06684521190560817, "language_loss": 0.83076346, "learning_rate": 0.00011849505912122117, "loss": 0.84130692, "num_input_tokens_seen": 337711968, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4070, "time_per_iteration": 2.7008423805236816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054175, "balance_loss_mlp": 1.04501987, "diversity_loss_mlp": 0.0, "epoch": 0.7831858407079646, "flos": 810055779840.0, "grad_norm": 0.07690857771038405, "language_loss": 0.78090364, "learning_rate": 0.00011829375634807654, "loss": 0.79144537, "num_input_tokens_seen": 337795792, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 4071, "time_per_iteration": 3.033573627471924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054971, "balance_loss_mlp": 1.04576814, "diversity_loss_mlp": 0.0, "epoch": 0.7833782223932282, "flos": 806594153472.0, "grad_norm": 0.056420463967120596, "language_loss": 0.81179786, "learning_rate": 0.00011809260176643821, "loss": 0.82234752, "num_input_tokens_seen": 337875584, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 4072, "time_per_iteration": 3.047667980194092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057919, "balance_loss_mlp": 1.0486629, "diversity_loss_mlp": 0.0, "epoch": 0.7835706040784918, "flos": 520870860288.0, "grad_norm": 0.08201668927537556, "language_loss": 0.83855987, "learning_rate": 0.00011789159545440131, "loss": 0.84913909, "num_input_tokens_seen": 337942304, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4073, "time_per_iteration": 2.5870485305786133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061783, "balance_loss_mlp": 1.05281854, "diversity_loss_mlp": 0.0, "epoch": 0.7837629857637552, "flos": 505605929472.0, "grad_norm": 0.05483100075639626, "language_loss": 0.82342023, "learning_rate": 0.00011769073749000348, "loss": 0.83403808, "num_input_tokens_seen": 338020864, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4074, "time_per_iteration": 2.7744524478912354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059737, "balance_loss_mlp": 1.05058742, "diversity_loss_mlp": 0.0, "epoch": 0.7839553674490188, "flos": 516124431360.0, "grad_norm": 0.07650558225741275, "language_loss": 0.76181698, "learning_rate": 0.0001174900279512246, "loss": 0.77241433, "num_input_tokens_seen": 338089584, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 4075, "time_per_iteration": 2.5718233585357666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055959, "balance_loss_mlp": 1.04716742, "diversity_loss_mlp": 0.0, "epoch": 0.7841477491342824, "flos": 506648825856.0, "grad_norm": 0.06638794146044662, "language_loss": 0.81755495, "learning_rate": 0.00011728946691598707, "loss": 0.82811451, "num_input_tokens_seen": 338159568, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4076, "time_per_iteration": 2.597710371017456 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057965, "balance_loss_mlp": 1.0489229, "diversity_loss_mlp": 0.0, "epoch": 0.784340130819546, "flos": 719636120064.0, "grad_norm": 0.07312696414479496, "language_loss": 0.76038092, "learning_rate": 0.00011708905446215561, "loss": 0.77096057, "num_input_tokens_seen": 338233952, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4077, "time_per_iteration": 2.8587801456451416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052857, "balance_loss_mlp": 1.04389191, "diversity_loss_mlp": 0.0, "epoch": 0.7845325125048095, "flos": 514441704960.0, "grad_norm": 0.05480426452035972, "language_loss": 0.79978698, "learning_rate": 0.00011688879066753711, "loss": 0.81031561, "num_input_tokens_seen": 338309568, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4078, "time_per_iteration": 2.6878645420074463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00794674, "balance_loss_mlp": 1.3435601, "diversity_loss_mlp": 0.22424069, "epoch": 0.7847248941900731, "flos": 466102646784.0, "grad_norm": 0.037025249970490705, "language_loss": 0.87360638, "learning_rate": 0.00011668867560988122, "loss": 0.88155311, "num_input_tokens_seen": 338375920, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01077335, "step": 4079, "time_per_iteration": 2.605992317199707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055214, "balance_loss_mlp": 1.04603505, "diversity_loss_mlp": 0.0, "epoch": 0.7849172758753367, "flos": 503028983808.0, "grad_norm": 0.07540056238596937, "language_loss": 0.84502101, "learning_rate": 0.00011648870936687916, "loss": 0.85557318, "num_input_tokens_seen": 338452208, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4080, "time_per_iteration": 2.803166627883911 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054284, "balance_loss_mlp": 1.04527164, "diversity_loss_mlp": 0.0, "epoch": 0.7851096575606002, "flos": 531999456768.0, "grad_norm": 0.07109491685615342, "language_loss": 0.7888999, "learning_rate": 0.00011628889201616461, "loss": 0.79944277, "num_input_tokens_seen": 338522864, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4081, "time_per_iteration": 2.6307146549224854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053935, "balance_loss_mlp": 1.04494071, "diversity_loss_mlp": 0.0, "epoch": 0.7853020392458638, "flos": 569956207104.0, "grad_norm": 0.06995649688675094, "language_loss": 0.8206296, "learning_rate": 0.00011608922363531393, "loss": 0.83116901, "num_input_tokens_seen": 338591024, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4082, "time_per_iteration": 2.6929171085357666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054913, "balance_loss_mlp": 1.04621124, "diversity_loss_mlp": 0.0, "epoch": 0.7854944209311273, "flos": 832579845120.0, "grad_norm": 0.06467745732761603, "language_loss": 0.83401716, "learning_rate": 0.00011588970430184504, "loss": 0.84456635, "num_input_tokens_seen": 338669616, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 4083, "time_per_iteration": 3.0374722480773926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055907, "balance_loss_mlp": 1.04704356, "diversity_loss_mlp": 0.0, "epoch": 0.7856868026163909, "flos": 559929604608.0, "grad_norm": 0.053416444226472466, "language_loss": 0.81812388, "learning_rate": 0.00011569033409321822, "loss": 0.82868296, "num_input_tokens_seen": 338740416, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4084, "time_per_iteration": 2.7151241302490234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056433, "balance_loss_mlp": 1.04721808, "diversity_loss_mlp": 0.0, "epoch": 0.7858791843016545, "flos": 545230725120.0, "grad_norm": 0.08362128305368578, "language_loss": 0.72967046, "learning_rate": 0.00011549111308683591, "loss": 0.74023485, "num_input_tokens_seen": 338807664, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4085, "time_per_iteration": 2.703397750854492 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053784, "balance_loss_mlp": 1.044855, "diversity_loss_mlp": 0.0, "epoch": 0.7860715659869181, "flos": 380997665280.0, "grad_norm": 0.07026628399198086, "language_loss": 0.80478334, "learning_rate": 0.00011529204136004251, "loss": 0.81532121, "num_input_tokens_seen": 338869472, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4086, "time_per_iteration": 2.4818243980407715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055251, "balance_loss_mlp": 1.04632854, "diversity_loss_mlp": 0.0, "epoch": 0.7862639476721817, "flos": 567440930304.0, "grad_norm": 0.06468878784636958, "language_loss": 0.84670031, "learning_rate": 0.00011509311899012459, "loss": 0.85725284, "num_input_tokens_seen": 338941312, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4087, "time_per_iteration": 2.6685831546783447 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052321, "balance_loss_mlp": 1.04333234, "diversity_loss_mlp": 0.0, "epoch": 0.7864563293574451, "flos": 545238065664.0, "grad_norm": 0.07857696263976417, "language_loss": 0.781057, "learning_rate": 0.00011489434605431053, "loss": 0.7915802, "num_input_tokens_seen": 339010208, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4088, "time_per_iteration": 2.634192705154419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050771, "balance_loss_mlp": 1.0415858, "diversity_loss_mlp": 0.0, "epoch": 0.7866487110427087, "flos": 563536963584.0, "grad_norm": 0.06849593864396217, "language_loss": 0.81194121, "learning_rate": 0.0001146957226297708, "loss": 0.82244897, "num_input_tokens_seen": 339081232, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 4089, "time_per_iteration": 2.6896586418151855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054397, "balance_loss_mlp": 1.04508066, "diversity_loss_mlp": 0.0, "epoch": 0.7868410927279723, "flos": 728189968896.0, "grad_norm": 0.06226549816004976, "language_loss": 0.76514363, "learning_rate": 0.00011449724879361827, "loss": 0.77568758, "num_input_tokens_seen": 339161040, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 4090, "time_per_iteration": 3.0211868286132812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105233, "balance_loss_mlp": 1.04349613, "diversity_loss_mlp": 0.0, "epoch": 0.7870334744132359, "flos": 521355045888.0, "grad_norm": 0.10606387135755017, "language_loss": 0.73947829, "learning_rate": 0.00011429892462290687, "loss": 0.75000155, "num_input_tokens_seen": 339233984, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4091, "time_per_iteration": 2.663403034210205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051781, "balance_loss_mlp": 1.04245293, "diversity_loss_mlp": 0.0, "epoch": 0.7872258560984994, "flos": 451411107840.0, "grad_norm": 0.07444773057019392, "language_loss": 0.83167046, "learning_rate": 0.00011410075019463295, "loss": 0.84218824, "num_input_tokens_seen": 339303168, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 4092, "time_per_iteration": 2.6732146739959717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048957, "balance_loss_mlp": 1.04006362, "diversity_loss_mlp": 0.0, "epoch": 0.787418237783763, "flos": 515195334144.0, "grad_norm": 0.060787527331610934, "language_loss": 0.80152667, "learning_rate": 0.00011390272558573461, "loss": 0.81201625, "num_input_tokens_seen": 339374512, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4093, "time_per_iteration": 2.7180373668670654 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046439, "balance_loss_mlp": 1.03762388, "diversity_loss_mlp": 0.0, "epoch": 0.7876106194690266, "flos": 485081021952.0, "grad_norm": 0.06490792600835427, "language_loss": 0.7982657, "learning_rate": 0.00011370485087309202, "loss": 0.80873013, "num_input_tokens_seen": 339442720, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4094, "time_per_iteration": 2.6366312503814697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049978, "balance_loss_mlp": 1.04087603, "diversity_loss_mlp": 0.0, "epoch": 0.7878030011542901, "flos": 542841357312.0, "grad_norm": 0.07475345031561743, "language_loss": 0.79215139, "learning_rate": 0.00011350712613352688, "loss": 0.80265117, "num_input_tokens_seen": 339508800, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4095, "time_per_iteration": 2.652498960494995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046751, "balance_loss_mlp": 1.0379113, "diversity_loss_mlp": 0.0, "epoch": 0.7879953828395537, "flos": 516739668480.0, "grad_norm": 0.08748048466921367, "language_loss": 0.79438257, "learning_rate": 0.00011330955144380283, "loss": 0.8048501, "num_input_tokens_seen": 339578048, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4096, "time_per_iteration": 2.641091823577881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051654, "balance_loss_mlp": 1.04231441, "diversity_loss_mlp": 0.0, "epoch": 0.7881877645248172, "flos": 582278201856.0, "grad_norm": 0.09762790842246886, "language_loss": 0.8590734, "learning_rate": 0.00011311212688062483, "loss": 0.86958992, "num_input_tokens_seen": 339650176, "router_z_loss_mlp": 0.09338379, "routerloss_mlp": 0.0, "step": 4097, "time_per_iteration": 2.7734925746917725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104816, "balance_loss_mlp": 1.03907609, "diversity_loss_mlp": 0.0, "epoch": 0.7883801462100808, "flos": 589171719168.0, "grad_norm": 0.07905994769378807, "language_loss": 0.77729434, "learning_rate": 0.0001129148525206402, "loss": 0.78777593, "num_input_tokens_seen": 339727312, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4098, "time_per_iteration": 2.7954680919647217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043495, "balance_loss_mlp": 1.03457785, "diversity_loss_mlp": 0.0, "epoch": 0.7885725278953444, "flos": 481728052224.0, "grad_norm": 0.07239705861159748, "language_loss": 0.86597443, "learning_rate": 0.00011271772844043759, "loss": 0.87640929, "num_input_tokens_seen": 339801344, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4099, "time_per_iteration": 2.6607439517974854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045104, "balance_loss_mlp": 1.03621721, "diversity_loss_mlp": 0.0, "epoch": 0.788764909580608, "flos": 756794824704.0, "grad_norm": 0.0879845315874332, "language_loss": 0.76285118, "learning_rate": 0.00011252075471654727, "loss": 0.7733022, "num_input_tokens_seen": 339877840, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4100, "time_per_iteration": 2.971648693084717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105122, "balance_loss_mlp": 1.04207063, "diversity_loss_mlp": 0.0, "epoch": 0.7889572912658714, "flos": 702555213312.0, "grad_norm": 0.0764302871750087, "language_loss": 0.77711362, "learning_rate": 0.00011232393142544133, "loss": 0.78762579, "num_input_tokens_seen": 339959568, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 4101, "time_per_iteration": 2.91229510307312 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047622, "balance_loss_mlp": 1.03860378, "diversity_loss_mlp": 0.0, "epoch": 0.789149672951135, "flos": 736405364736.0, "grad_norm": 0.07185195333789275, "language_loss": 0.82940054, "learning_rate": 0.00011212725864353323, "loss": 0.83987677, "num_input_tokens_seen": 340043600, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4102, "time_per_iteration": 3.1023645401000977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01025318, "balance_loss_mlp": 1.02088332, "diversity_loss_mlp": 0.0, "epoch": 0.7893420546363986, "flos": 1481396511744.0, "grad_norm": 0.024083596003167965, "language_loss": 0.76335925, "learning_rate": 0.00011193073644717822, "loss": 0.77361244, "num_input_tokens_seen": 340270608, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4103, "time_per_iteration": 4.869060754776001 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045889, "balance_loss_mlp": 1.03684092, "diversity_loss_mlp": 0.0, "epoch": 0.7895344363216622, "flos": 509072698368.0, "grad_norm": 0.08808407727788632, "language_loss": 0.75807375, "learning_rate": 0.00011173436491267291, "loss": 0.76853269, "num_input_tokens_seen": 340338784, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4104, "time_per_iteration": 2.632619619369507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051432, "balance_loss_mlp": 1.04226446, "diversity_loss_mlp": 0.0, "epoch": 0.7897268180069258, "flos": 541988983296.0, "grad_norm": 0.06591293045265766, "language_loss": 0.81841874, "learning_rate": 0.0001115381441162554, "loss": 0.82893306, "num_input_tokens_seen": 340407744, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 4105, "time_per_iteration": 2.6688740253448486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01015618, "balance_loss_mlp": 1.0112071, "diversity_loss_mlp": 0.0, "epoch": 0.7899191996921893, "flos": 1412687817216.0, "grad_norm": 0.01578072375455914, "language_loss": 0.73583722, "learning_rate": 0.00011134207413410557, "loss": 0.74599338, "num_input_tokens_seen": 340635824, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4106, "time_per_iteration": 4.878762245178223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050683, "balance_loss_mlp": 1.041677, "diversity_loss_mlp": 0.0, "epoch": 0.7901115813774529, "flos": 622841633280.0, "grad_norm": 0.06419159755656932, "language_loss": 0.85182965, "learning_rate": 0.00011114615504234465, "loss": 0.86233652, "num_input_tokens_seen": 340710928, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4107, "time_per_iteration": 2.7453701496124268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046515, "balance_loss_mlp": 1.03746724, "diversity_loss_mlp": 0.0, "epoch": 0.7903039630627164, "flos": 645545935872.0, "grad_norm": 0.07341048206377168, "language_loss": 0.80923963, "learning_rate": 0.00011095038691703468, "loss": 0.81970477, "num_input_tokens_seen": 340786128, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4108, "time_per_iteration": 2.857043504714966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047055, "balance_loss_mlp": 1.03800678, "diversity_loss_mlp": 0.0, "epoch": 0.79049634474798, "flos": 594365257728.0, "grad_norm": 0.06655370110946672, "language_loss": 0.82816958, "learning_rate": 0.00011075476983417998, "loss": 0.83864009, "num_input_tokens_seen": 340861616, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4109, "time_per_iteration": 2.8551764488220215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049932, "balance_loss_mlp": 1.04054475, "diversity_loss_mlp": 0.0, "epoch": 0.7906887264332435, "flos": 716093001216.0, "grad_norm": 0.08565145998771567, "language_loss": 0.7770009, "learning_rate": 0.00011055930386972579, "loss": 0.78750026, "num_input_tokens_seen": 340934480, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 4110, "time_per_iteration": 2.9051218032836914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104864, "balance_loss_mlp": 1.03950906, "diversity_loss_mlp": 0.0, "epoch": 0.7908811081185071, "flos": 789893918208.0, "grad_norm": 0.07889594156212229, "language_loss": 0.78524226, "learning_rate": 0.00011036398909955863, "loss": 0.79572868, "num_input_tokens_seen": 341014912, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 4111, "time_per_iteration": 2.9591848850250244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00801967, "balance_loss_mlp": 1.35861206, "diversity_loss_mlp": 0.22341654, "epoch": 0.7910734898037707, "flos": 641904072192.0, "grad_norm": 0.031814716701276446, "language_loss": 0.81445456, "learning_rate": 0.00011016882559950648, "loss": 0.82247424, "num_input_tokens_seen": 341090608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0109526, "step": 4112, "time_per_iteration": 2.8517532348632812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049762, "balance_loss_mlp": 1.04066622, "diversity_loss_mlp": 0.0, "epoch": 0.7912658714890343, "flos": 669357374976.0, "grad_norm": 0.06825914372029093, "language_loss": 0.80628312, "learning_rate": 0.00010997381344533853, "loss": 0.81678075, "num_input_tokens_seen": 341160992, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4113, "time_per_iteration": 2.76458477973938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054223, "balance_loss_mlp": 1.04482937, "diversity_loss_mlp": 0.0, "epoch": 0.7914582531742979, "flos": 557779944960.0, "grad_norm": 0.06296725861693256, "language_loss": 0.80975449, "learning_rate": 0.00010977895271276517, "loss": 0.82029676, "num_input_tokens_seen": 341232032, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 4114, "time_per_iteration": 2.677236795425415 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105271, "balance_loss_mlp": 1.04387641, "diversity_loss_mlp": 0.0, "epoch": 0.7916506348595613, "flos": 570064863744.0, "grad_norm": 0.07698010071595295, "language_loss": 0.79882276, "learning_rate": 0.00010958424347743807, "loss": 0.80934995, "num_input_tokens_seen": 341303888, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4115, "time_per_iteration": 2.7255280017852783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056753, "balance_loss_mlp": 1.04793203, "diversity_loss_mlp": 0.0, "epoch": 0.7918430165448249, "flos": 718301758464.0, "grad_norm": 0.06323084510093162, "language_loss": 0.80379033, "learning_rate": 0.00010938968581494991, "loss": 0.81435782, "num_input_tokens_seen": 341385616, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4116, "time_per_iteration": 2.956744909286499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056354, "balance_loss_mlp": 1.0473659, "diversity_loss_mlp": 0.0, "epoch": 0.7920353982300885, "flos": 553648753152.0, "grad_norm": 0.07593804019744407, "language_loss": 0.78918922, "learning_rate": 0.000109195279800835, "loss": 0.79975271, "num_input_tokens_seen": 341460976, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4117, "time_per_iteration": 2.7232017517089844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052824, "balance_loss_mlp": 1.04372239, "diversity_loss_mlp": 0.0, "epoch": 0.7922277799153521, "flos": 810120019968.0, "grad_norm": 0.07668598230710005, "language_loss": 0.76558191, "learning_rate": 0.00010900102551056834, "loss": 0.77611017, "num_input_tokens_seen": 341537328, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4118, "time_per_iteration": 3.0348682403564453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105099, "balance_loss_mlp": 1.04203153, "diversity_loss_mlp": 0.0, "epoch": 0.7924201616006156, "flos": 421351123968.0, "grad_norm": 0.06933579681898581, "language_loss": 0.8458457, "learning_rate": 0.00010880692301956601, "loss": 0.85635561, "num_input_tokens_seen": 341600272, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4119, "time_per_iteration": 2.465395212173462 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059146, "balance_loss_mlp": 1.05027056, "diversity_loss_mlp": 0.0, "epoch": 0.7926125432858792, "flos": 617852924928.0, "grad_norm": 0.06493837690301978, "language_loss": 0.86651456, "learning_rate": 0.00010861297240318518, "loss": 0.87710601, "num_input_tokens_seen": 341682096, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4120, "time_per_iteration": 2.8506181240081787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056946, "balance_loss_mlp": 1.04826188, "diversity_loss_mlp": 0.0, "epoch": 0.7928049249711427, "flos": 602487051264.0, "grad_norm": 0.07524766323731863, "language_loss": 0.87229133, "learning_rate": 0.00010841917373672444, "loss": 0.88286078, "num_input_tokens_seen": 341754912, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 4121, "time_per_iteration": 2.745227336883545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055842, "balance_loss_mlp": 1.04712808, "diversity_loss_mlp": 0.0, "epoch": 0.7929973066564063, "flos": 656024790528.0, "grad_norm": 0.08118940133699648, "language_loss": 0.78629029, "learning_rate": 0.00010822552709542293, "loss": 0.79684877, "num_input_tokens_seen": 341831152, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4122, "time_per_iteration": 2.813340425491333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055553, "balance_loss_mlp": 1.04677343, "diversity_loss_mlp": 0.0, "epoch": 0.7931896883416699, "flos": 536397520896.0, "grad_norm": 0.058728515527731805, "language_loss": 0.86142117, "learning_rate": 0.0001080320325544612, "loss": 0.87197673, "num_input_tokens_seen": 341903552, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 4123, "time_per_iteration": 2.6903398036956787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053347, "balance_loss_mlp": 1.04438257, "diversity_loss_mlp": 0.0, "epoch": 0.7933820700269334, "flos": 498082493952.0, "grad_norm": 0.06377375336372411, "language_loss": 0.83519953, "learning_rate": 0.00010783869018895997, "loss": 0.84573305, "num_input_tokens_seen": 341972256, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4124, "time_per_iteration": 2.6091437339782715 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055841, "balance_loss_mlp": 1.04709673, "diversity_loss_mlp": 0.0, "epoch": 0.793574451712197, "flos": 537472350720.0, "grad_norm": 0.06290112703691109, "language_loss": 0.84019685, "learning_rate": 0.00010764550007398189, "loss": 0.85075527, "num_input_tokens_seen": 342040496, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 4125, "time_per_iteration": 2.639021396636963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105337, "balance_loss_mlp": 1.04447079, "diversity_loss_mlp": 0.0, "epoch": 0.7937668333974606, "flos": 488285687808.0, "grad_norm": 0.059983052052207615, "language_loss": 0.81026101, "learning_rate": 0.00010745246228452982, "loss": 0.8207947, "num_input_tokens_seen": 342108512, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4126, "time_per_iteration": 2.567128896713257 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055054, "balance_loss_mlp": 1.04658413, "diversity_loss_mlp": 0.0, "epoch": 0.7939592150827242, "flos": 527425924608.0, "grad_norm": 0.06538981258691282, "language_loss": 0.81837595, "learning_rate": 0.00010725957689554771, "loss": 0.82892644, "num_input_tokens_seen": 342183568, "router_z_loss_mlp": 0.08477783, "routerloss_mlp": 0.0, "step": 4127, "time_per_iteration": 2.7668473720550537 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105364, "balance_loss_mlp": 1.04483056, "diversity_loss_mlp": 0.0, "epoch": 0.7941515967679876, "flos": 541702287360.0, "grad_norm": 0.06455760363891609, "language_loss": 0.84442085, "learning_rate": 0.00010706684398192013, "loss": 0.85495722, "num_input_tokens_seen": 342259920, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4128, "time_per_iteration": 2.703094482421875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056081, "balance_loss_mlp": 1.04694915, "diversity_loss_mlp": 0.0, "epoch": 0.7943439784532512, "flos": 518387516928.0, "grad_norm": 0.10398066376678644, "language_loss": 0.81773114, "learning_rate": 0.00010687426361847313, "loss": 0.82829189, "num_input_tokens_seen": 342330192, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4129, "time_per_iteration": 2.730570077896118 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054699, "balance_loss_mlp": 1.04571033, "diversity_loss_mlp": 0.0, "epoch": 0.7945363601385148, "flos": 509025710592.0, "grad_norm": 0.06937610081260179, "language_loss": 0.8574326, "learning_rate": 0.00010668183587997254, "loss": 0.86797965, "num_input_tokens_seen": 342398944, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4130, "time_per_iteration": 2.644259452819824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051071, "balance_loss_mlp": 1.04217792, "diversity_loss_mlp": 0.0, "epoch": 0.7947287418237784, "flos": 651214121472.0, "grad_norm": 0.05953600763070223, "language_loss": 0.77579701, "learning_rate": 0.0001064895608411256, "loss": 0.78630781, "num_input_tokens_seen": 342474000, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4131, "time_per_iteration": 2.841925859451294 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105178, "balance_loss_mlp": 1.04286337, "diversity_loss_mlp": 0.0, "epoch": 0.794921123509042, "flos": 696054477312.0, "grad_norm": 0.06486183241314894, "language_loss": 0.80494809, "learning_rate": 0.00010629743857657998, "loss": 0.81546587, "num_input_tokens_seen": 342549184, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4132, "time_per_iteration": 2.9550116062164307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01007878, "balance_loss_mlp": 1.00334787, "diversity_loss_mlp": 0.0, "epoch": 0.7951135051943055, "flos": 1402942768128.0, "grad_norm": 0.014279472424614392, "language_loss": 0.70598668, "learning_rate": 0.0001061054691609244, "loss": 0.71606547, "num_input_tokens_seen": 342767376, "router_z_loss_mlp": 0.04541016, "routerloss_mlp": 0.0, "step": 4133, "time_per_iteration": 4.61087965965271 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059576, "balance_loss_mlp": 1.05091596, "diversity_loss_mlp": 0.0, "epoch": 0.795305886879569, "flos": 810085515264.0, "grad_norm": 0.08419096338195846, "language_loss": 0.82037973, "learning_rate": 0.00010591365266868802, "loss": 0.83097553, "num_input_tokens_seen": 342845024, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 4134, "time_per_iteration": 2.980473518371582 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006707, "balance_loss_mlp": 1.00217748, "diversity_loss_mlp": 0.0, "epoch": 0.7954982685648326, "flos": 1426005347328.0, "grad_norm": 0.013377465040040408, "language_loss": 0.75511783, "learning_rate": 0.00010572198917434018, "loss": 0.76518488, "num_input_tokens_seen": 343072496, "router_z_loss_mlp": 0.04541016, "routerloss_mlp": 0.0, "step": 4135, "time_per_iteration": 5.031512975692749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051378, "balance_loss_mlp": 1.04224622, "diversity_loss_mlp": 0.0, "epoch": 0.7956906502500962, "flos": 389885197824.0, "grad_norm": 0.08143958467983652, "language_loss": 0.7928952, "learning_rate": 0.00010553047875229166, "loss": 0.80340898, "num_input_tokens_seen": 343136928, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 4136, "time_per_iteration": 2.536219596862793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053745, "balance_loss_mlp": 1.04491794, "diversity_loss_mlp": 0.0, "epoch": 0.7958830319353598, "flos": 515573434368.0, "grad_norm": 0.05917621440441134, "language_loss": 0.8352496, "learning_rate": 0.00010533912147689328, "loss": 0.84578705, "num_input_tokens_seen": 343207440, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4137, "time_per_iteration": 2.62947416305542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052392, "balance_loss_mlp": 1.04364753, "diversity_loss_mlp": 0.0, "epoch": 0.7960754136206233, "flos": 493941390336.0, "grad_norm": 0.07247645097842569, "language_loss": 0.82383895, "learning_rate": 0.00010514791742243656, "loss": 0.83436286, "num_input_tokens_seen": 343273744, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 4138, "time_per_iteration": 2.6058223247528076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053846, "balance_loss_mlp": 1.04486322, "diversity_loss_mlp": 0.0, "epoch": 0.7962677953058869, "flos": 655728182784.0, "grad_norm": 0.07856202151848143, "language_loss": 0.82678479, "learning_rate": 0.00010495686666315341, "loss": 0.83732331, "num_input_tokens_seen": 343357648, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4139, "time_per_iteration": 2.8820180892944336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053854, "balance_loss_mlp": 1.04509258, "diversity_loss_mlp": 0.0, "epoch": 0.7964601769911505, "flos": 542384335872.0, "grad_norm": 0.09207393340076041, "language_loss": 0.77504325, "learning_rate": 0.00010476596927321635, "loss": 0.78558183, "num_input_tokens_seen": 343425344, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4140, "time_per_iteration": 2.5876264572143555 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054719, "balance_loss_mlp": 1.04586816, "diversity_loss_mlp": 0.0, "epoch": 0.796652558676414, "flos": 537650016768.0, "grad_norm": 0.06332389355869186, "language_loss": 0.80286723, "learning_rate": 0.00010457522532673835, "loss": 0.81341445, "num_input_tokens_seen": 343504960, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4141, "time_per_iteration": 2.7853429317474365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053842, "balance_loss_mlp": 1.04521155, "diversity_loss_mlp": 0.0, "epoch": 0.7968449403616775, "flos": 475091495424.0, "grad_norm": 0.07594916891501999, "language_loss": 0.83322799, "learning_rate": 0.00010438463489777272, "loss": 0.84376645, "num_input_tokens_seen": 343570832, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 4142, "time_per_iteration": 2.574995756149292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053543, "balance_loss_mlp": 1.0441432, "diversity_loss_mlp": 0.0, "epoch": 0.7970373220469411, "flos": 567613827072.0, "grad_norm": 0.06219380630034642, "language_loss": 0.77388006, "learning_rate": 0.00010419419806031316, "loss": 0.78441548, "num_input_tokens_seen": 343639808, "router_z_loss_mlp": 0.09399414, "routerloss_mlp": 0.0, "step": 4143, "time_per_iteration": 2.681364059448242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057205, "balance_loss_mlp": 1.04838395, "diversity_loss_mlp": 0.0, "epoch": 0.7972297037322047, "flos": 556208446464.0, "grad_norm": 0.06244291716660837, "language_loss": 0.83778638, "learning_rate": 0.00010400391488829403, "loss": 0.84835839, "num_input_tokens_seen": 343715232, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4144, "time_per_iteration": 2.7661397457122803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056681, "balance_loss_mlp": 1.04754949, "diversity_loss_mlp": 0.0, "epoch": 0.7974220854174683, "flos": 576180158976.0, "grad_norm": 0.056029857219710606, "language_loss": 0.86605, "learning_rate": 0.00010381378545558984, "loss": 0.87661684, "num_input_tokens_seen": 343787168, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 4145, "time_per_iteration": 2.706909656524658 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051079, "balance_loss_mlp": 1.04191816, "diversity_loss_mlp": 0.0, "epoch": 0.7976144671027319, "flos": 483069754368.0, "grad_norm": 0.06718577287314217, "language_loss": 0.84665811, "learning_rate": 0.00010362380983601505, "loss": 0.85716891, "num_input_tokens_seen": 343853600, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4146, "time_per_iteration": 2.529480218887329 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055069, "balance_loss_mlp": 1.04609227, "diversity_loss_mlp": 0.0, "epoch": 0.7978068487879953, "flos": 1077865615872.0, "grad_norm": 0.0571367932207486, "language_loss": 0.7866556, "learning_rate": 0.00010343398810332477, "loss": 0.79720628, "num_input_tokens_seen": 343942816, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4147, "time_per_iteration": 3.4586639404296875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105507, "balance_loss_mlp": 1.04595661, "diversity_loss_mlp": 0.0, "epoch": 0.7979992304732589, "flos": 733739586048.0, "grad_norm": 0.07566676342485233, "language_loss": 0.84437156, "learning_rate": 0.00010324432033121467, "loss": 0.85492229, "num_input_tokens_seen": 344021232, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4148, "time_per_iteration": 2.8839025497436523 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053366, "balance_loss_mlp": 1.04418659, "diversity_loss_mlp": 0.0, "epoch": 0.7981916121585225, "flos": 415774342656.0, "grad_norm": 0.06830192551222886, "language_loss": 0.83435208, "learning_rate": 0.00010305480659332005, "loss": 0.84488571, "num_input_tokens_seen": 344089616, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4149, "time_per_iteration": 2.5951197147369385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059283, "balance_loss_mlp": 1.05012214, "diversity_loss_mlp": 0.0, "epoch": 0.7983839938437861, "flos": 465257613312.0, "grad_norm": 0.07563453451103978, "language_loss": 0.83492422, "learning_rate": 0.00010286544696321682, "loss": 0.84551704, "num_input_tokens_seen": 344154992, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4150, "time_per_iteration": 2.5118510723114014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055471, "balance_loss_mlp": 1.04628563, "diversity_loss_mlp": 0.0, "epoch": 0.7985763755290496, "flos": 510567473664.0, "grad_norm": 0.07562833621575128, "language_loss": 0.7924732, "learning_rate": 0.00010267624151442073, "loss": 0.80302793, "num_input_tokens_seen": 344225232, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4151, "time_per_iteration": 2.612138509750366 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052309, "balance_loss_mlp": 1.04312396, "diversity_loss_mlp": 0.0, "epoch": 0.7987687572143132, "flos": 1010649498624.0, "grad_norm": 0.07020647270289845, "language_loss": 0.80794007, "learning_rate": 0.000102487190320388, "loss": 0.81846315, "num_input_tokens_seen": 344309120, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 4152, "time_per_iteration": 3.3858306407928467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052492, "balance_loss_mlp": 1.0432297, "diversity_loss_mlp": 0.0, "epoch": 0.7989611388995768, "flos": 1021078794240.0, "grad_norm": 0.08528953367031804, "language_loss": 0.79654646, "learning_rate": 0.00010229829345451475, "loss": 0.80707145, "num_input_tokens_seen": 344394112, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 4153, "time_per_iteration": 3.326597213745117 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056162, "balance_loss_mlp": 1.04706669, "diversity_loss_mlp": 0.0, "epoch": 0.7991535205848403, "flos": 1101338601984.0, "grad_norm": 0.06462141101761633, "language_loss": 0.79619837, "learning_rate": 0.00010210955099013724, "loss": 0.80676001, "num_input_tokens_seen": 344476512, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4154, "time_per_iteration": 3.3817038536071777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054824, "balance_loss_mlp": 1.04566312, "diversity_loss_mlp": 0.0, "epoch": 0.7993459022701039, "flos": 834818337792.0, "grad_norm": 0.07616557599778462, "language_loss": 0.76846623, "learning_rate": 0.00010192096300053167, "loss": 0.77901447, "num_input_tokens_seen": 344561088, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4155, "time_per_iteration": 3.081740379333496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105106, "balance_loss_mlp": 1.04188037, "diversity_loss_mlp": 0.0, "epoch": 0.7995382839553674, "flos": 522686836224.0, "grad_norm": 0.0612954553036602, "language_loss": 0.85157597, "learning_rate": 0.00010173252955891477, "loss": 0.86208659, "num_input_tokens_seen": 344639424, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4156, "time_per_iteration": 2.7239129543304443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055997, "balance_loss_mlp": 1.04709256, "diversity_loss_mlp": 0.0, "epoch": 0.799730665640631, "flos": 537820715520.0, "grad_norm": 0.07720224754254114, "language_loss": 0.73362273, "learning_rate": 0.00010154425073844253, "loss": 0.74418271, "num_input_tokens_seen": 344710048, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4157, "time_per_iteration": 2.696467638015747 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052019, "balance_loss_mlp": 1.04316235, "diversity_loss_mlp": 0.0, "epoch": 0.7999230473258946, "flos": 505060075008.0, "grad_norm": 0.060505733748086536, "language_loss": 0.82517296, "learning_rate": 0.00010135612661221138, "loss": 0.83569312, "num_input_tokens_seen": 344776832, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4158, "time_per_iteration": 2.582913398742676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047951, "balance_loss_mlp": 1.03880203, "diversity_loss_mlp": 0.0, "epoch": 0.8001154290111582, "flos": 1027342393344.0, "grad_norm": 0.08198302238912947, "language_loss": 0.81945235, "learning_rate": 0.00010116815725325751, "loss": 0.82993186, "num_input_tokens_seen": 344864928, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 4159, "time_per_iteration": 3.28433895111084 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00798548, "balance_loss_mlp": 1.34939909, "diversity_loss_mlp": 0.22584054, "epoch": 0.8003078106964217, "flos": 750906754560.0, "grad_norm": 0.032371691049230863, "language_loss": 0.80472159, "learning_rate": 0.00010098034273455725, "loss": 0.81270707, "num_input_tokens_seen": 344944048, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01092844, "step": 4160, "time_per_iteration": 3.020301342010498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047481, "balance_loss_mlp": 1.03802133, "diversity_loss_mlp": 0.0, "epoch": 0.8005001923816852, "flos": 488465925120.0, "grad_norm": 0.06923738075728161, "language_loss": 0.79914421, "learning_rate": 0.00010079268312902662, "loss": 0.80961907, "num_input_tokens_seen": 345015392, "router_z_loss_mlp": 0.09448242, "routerloss_mlp": 0.0, "step": 4161, "time_per_iteration": 2.663827657699585 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053373, "balance_loss_mlp": 1.04445577, "diversity_loss_mlp": 0.0, "epoch": 0.8006925740669488, "flos": 513248306688.0, "grad_norm": 0.07955090405050065, "language_loss": 0.82002842, "learning_rate": 0.0001006051785095215, "loss": 0.83056211, "num_input_tokens_seen": 345086640, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4162, "time_per_iteration": 2.669938087463379 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052136, "balance_loss_mlp": 1.04306972, "diversity_loss_mlp": 0.0, "epoch": 0.8008849557522124, "flos": 578529879552.0, "grad_norm": 0.07737392704066832, "language_loss": 0.79858398, "learning_rate": 0.0001004178289488376, "loss": 0.80910534, "num_input_tokens_seen": 345159616, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4163, "time_per_iteration": 2.7215919494628906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052219, "balance_loss_mlp": 1.04284358, "diversity_loss_mlp": 0.0, "epoch": 0.801077337437476, "flos": 478708766208.0, "grad_norm": 0.06994031793136987, "language_loss": 0.83999282, "learning_rate": 0.0001002306345197106, "loss": 0.85051501, "num_input_tokens_seen": 345225536, "router_z_loss_mlp": 0.09368896, "routerloss_mlp": 0.0, "step": 4164, "time_per_iteration": 2.545501708984375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049385, "balance_loss_mlp": 1.04034317, "diversity_loss_mlp": 0.0, "epoch": 0.8012697191227395, "flos": 676700573184.0, "grad_norm": 0.07265204276246538, "language_loss": 0.80238962, "learning_rate": 0.00010004359529481571, "loss": 0.81288344, "num_input_tokens_seen": 345302960, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4165, "time_per_iteration": 2.8751044273376465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049571, "balance_loss_mlp": 1.04052877, "diversity_loss_mlp": 0.0, "epoch": 0.8014621008080031, "flos": 1295132405760.0, "grad_norm": 0.07344708402099766, "language_loss": 0.82382286, "learning_rate": 9.985671134676804e-05, "loss": 0.83431858, "num_input_tokens_seen": 345397792, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4166, "time_per_iteration": 3.706587314605713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051894, "balance_loss_mlp": 1.04301274, "diversity_loss_mlp": 0.0, "epoch": 0.8016544824932667, "flos": 511827683328.0, "grad_norm": 0.0782603427027698, "language_loss": 0.83461916, "learning_rate": 9.966998274812234e-05, "loss": 0.84513807, "num_input_tokens_seen": 345465440, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4167, "time_per_iteration": 2.5965118408203125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050163, "balance_loss_mlp": 1.04132366, "diversity_loss_mlp": 0.0, "epoch": 0.8018468641785302, "flos": 535690879488.0, "grad_norm": 0.08470873380508834, "language_loss": 0.81762064, "learning_rate": 9.948340957137308e-05, "loss": 0.82812226, "num_input_tokens_seen": 345533072, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4168, "time_per_iteration": 2.6369173526763916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053937, "balance_loss_mlp": 1.04494286, "diversity_loss_mlp": 0.0, "epoch": 0.8020392458637937, "flos": 1023431086080.0, "grad_norm": 0.07955948845391579, "language_loss": 0.79946613, "learning_rate": 9.929699188895447e-05, "loss": 0.81000549, "num_input_tokens_seen": 345622208, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4169, "time_per_iteration": 3.257819652557373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00590619, "balance_loss_mlp": 1.02878523, "diversity_loss_mlp": 0.13400336, "epoch": 0.8022316275490573, "flos": 1561806821376.0, "grad_norm": 0.001271365187533197, "language_loss": 0.78054404, "learning_rate": 9.911072977324009e-05, "loss": 0.78645021, "num_input_tokens_seen": 345852544, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00922488, "step": 4170, "time_per_iteration": 4.967956066131592 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052767, "balance_loss_mlp": 1.04368353, "diversity_loss_mlp": 0.0, "epoch": 0.8024240092343209, "flos": 420698810880.0, "grad_norm": 0.06699330376146911, "language_loss": 0.83303684, "learning_rate": 9.89246232965435e-05, "loss": 0.84356451, "num_input_tokens_seen": 345917328, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4171, "time_per_iteration": 2.511323928833008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053852, "balance_loss_mlp": 1.04476857, "diversity_loss_mlp": 0.0, "epoch": 0.8026163909195845, "flos": 763836645888.0, "grad_norm": 0.0707874133261092, "language_loss": 0.7890135, "learning_rate": 9.873867253111762e-05, "loss": 0.79955202, "num_input_tokens_seen": 345995936, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4172, "time_per_iteration": 2.938361644744873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01002455, "balance_loss_mlp": 0.99778163, "diversity_loss_mlp": 0.0, "epoch": 0.8028087726048481, "flos": 1518861362688.0, "grad_norm": 0.01094338931973828, "language_loss": 0.80264562, "learning_rate": 9.855287754915503e-05, "loss": 0.81267017, "num_input_tokens_seen": 346232720, "router_z_loss_mlp": 0.04663086, "routerloss_mlp": 0.0, "step": 4173, "time_per_iteration": 4.908462285995483 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00793014, "balance_loss_mlp": 1.33927226, "diversity_loss_mlp": 0.22488941, "epoch": 0.8030011542901115, "flos": 517861486080.0, "grad_norm": 0.03516130293682118, "language_loss": 0.88785201, "learning_rate": 9.836723842278733e-05, "loss": 0.89578211, "num_input_tokens_seen": 346298208, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01093344, "step": 4174, "time_per_iteration": 2.5922460556030273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053625, "balance_loss_mlp": 1.04467213, "diversity_loss_mlp": 0.0, "epoch": 0.8031935359753751, "flos": 545616165888.0, "grad_norm": 0.07944554575907646, "language_loss": 0.78243375, "learning_rate": 9.818175522408646e-05, "loss": 0.79296994, "num_input_tokens_seen": 346370080, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4175, "time_per_iteration": 2.6601076126098633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051864, "balance_loss_mlp": 1.04280424, "diversity_loss_mlp": 0.0, "epoch": 0.8033859176606387, "flos": 603559309824.0, "grad_norm": 0.06387478026678979, "language_loss": 0.84549594, "learning_rate": 9.79964280250632e-05, "loss": 0.85601461, "num_input_tokens_seen": 346442432, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4176, "time_per_iteration": 2.7655818462371826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049571, "balance_loss_mlp": 1.0406065, "diversity_loss_mlp": 0.0, "epoch": 0.8035782993459023, "flos": 565859520000.0, "grad_norm": 0.07434715811474918, "language_loss": 0.81265736, "learning_rate": 9.781125689766795e-05, "loss": 0.82315314, "num_input_tokens_seen": 346513088, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4177, "time_per_iteration": 2.7365646362304688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051174, "balance_loss_mlp": 1.04198945, "diversity_loss_mlp": 0.0, "epoch": 0.8037706810311658, "flos": 538435952640.0, "grad_norm": 0.0854183247343152, "language_loss": 0.84699386, "learning_rate": 9.762624191379054e-05, "loss": 0.85750556, "num_input_tokens_seen": 346581376, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4178, "time_per_iteration": 2.6607935428619385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047249, "balance_loss_mlp": 1.03811717, "diversity_loss_mlp": 0.0, "epoch": 0.8039630627164294, "flos": 515187993600.0, "grad_norm": 0.07548014236337308, "language_loss": 0.79687864, "learning_rate": 9.744138314526014e-05, "loss": 0.80735117, "num_input_tokens_seen": 346653328, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 4179, "time_per_iteration": 2.649068593978882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01001844, "balance_loss_mlp": 0.99719512, "diversity_loss_mlp": 0.0, "epoch": 0.804155444401693, "flos": 1478834247168.0, "grad_norm": 0.010296775940752873, "language_loss": 0.74733561, "learning_rate": 9.725668066384535e-05, "loss": 0.75735408, "num_input_tokens_seen": 346873264, "router_z_loss_mlp": 0.04638672, "routerloss_mlp": 0.0, "step": 4180, "time_per_iteration": 4.874431133270264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050118, "balance_loss_mlp": 1.04090953, "diversity_loss_mlp": 0.0, "epoch": 0.8043478260869565, "flos": 521164896768.0, "grad_norm": 0.07453821883084652, "language_loss": 0.77098471, "learning_rate": 9.707213454125396e-05, "loss": 0.78148586, "num_input_tokens_seen": 346946272, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 4181, "time_per_iteration": 2.687908887863159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045921, "balance_loss_mlp": 1.03656304, "diversity_loss_mlp": 0.0, "epoch": 0.8045402077722201, "flos": 545448038400.0, "grad_norm": 0.06056113889476793, "language_loss": 0.80571556, "learning_rate": 9.688774484913298e-05, "loss": 0.81617486, "num_input_tokens_seen": 347024048, "router_z_loss_mlp": 0.09356689, "routerloss_mlp": 0.0, "step": 4182, "time_per_iteration": 2.755779981613159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054246, "balance_loss_mlp": 1.04540682, "diversity_loss_mlp": 0.0, "epoch": 0.8047325894574836, "flos": 678388068864.0, "grad_norm": 0.07500472983981471, "language_loss": 0.7412895, "learning_rate": 9.670351165906921e-05, "loss": 0.75183195, "num_input_tokens_seen": 347108736, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4183, "time_per_iteration": 2.959167242050171 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046099, "balance_loss_mlp": 1.03698587, "diversity_loss_mlp": 0.0, "epoch": 0.8049249711427472, "flos": 587227262976.0, "grad_norm": 0.07263280839339305, "language_loss": 0.78791356, "learning_rate": 9.65194350425882e-05, "loss": 0.79837459, "num_input_tokens_seen": 347184192, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4184, "time_per_iteration": 2.7201614379882812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049152, "balance_loss_mlp": 1.0401814, "diversity_loss_mlp": 0.0, "epoch": 0.8051173528280108, "flos": 814194312192.0, "grad_norm": 0.0782100616306692, "language_loss": 0.77473164, "learning_rate": 9.633551507115452e-05, "loss": 0.78522313, "num_input_tokens_seen": 347282336, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4185, "time_per_iteration": 3.134634256362915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010494, "balance_loss_mlp": 1.04034662, "diversity_loss_mlp": 0.0, "epoch": 0.8053097345132744, "flos": 725687175168.0, "grad_norm": 0.06922447607886563, "language_loss": 0.77592742, "learning_rate": 9.615175181617259e-05, "loss": 0.78642142, "num_input_tokens_seen": 347364800, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4186, "time_per_iteration": 2.961618423461914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051819, "balance_loss_mlp": 1.04297376, "diversity_loss_mlp": 0.0, "epoch": 0.805502116198538, "flos": 748050453504.0, "grad_norm": 0.0745309975524961, "language_loss": 0.81570286, "learning_rate": 9.596814534898552e-05, "loss": 0.82622111, "num_input_tokens_seen": 347443328, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4187, "time_per_iteration": 2.9941747188568115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050277, "balance_loss_mlp": 1.04128897, "diversity_loss_mlp": 0.0, "epoch": 0.8056944978838014, "flos": 640258421760.0, "grad_norm": 0.06519286758654869, "language_loss": 0.87670028, "learning_rate": 9.578469574087561e-05, "loss": 0.88720298, "num_input_tokens_seen": 347522064, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4188, "time_per_iteration": 2.8933184146881104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049222, "balance_loss_mlp": 1.04018009, "diversity_loss_mlp": 0.0, "epoch": 0.805886879569065, "flos": 644631892992.0, "grad_norm": 0.07111853308758409, "language_loss": 0.78227425, "learning_rate": 9.560140306306436e-05, "loss": 0.79276645, "num_input_tokens_seen": 347597200, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4189, "time_per_iteration": 2.8829870223999023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050789, "balance_loss_mlp": 1.0420208, "diversity_loss_mlp": 0.0, "epoch": 0.8060792612543286, "flos": 661230812160.0, "grad_norm": 0.07715619542299273, "language_loss": 0.81660378, "learning_rate": 9.541826738671233e-05, "loss": 0.8271116, "num_input_tokens_seen": 347676928, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4190, "time_per_iteration": 2.805797815322876 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050431, "balance_loss_mlp": 1.041592, "diversity_loss_mlp": 0.0, "epoch": 0.8062716429395922, "flos": 455075366400.0, "grad_norm": 0.07784281121647556, "language_loss": 0.82554364, "learning_rate": 9.523528878291904e-05, "loss": 0.83604801, "num_input_tokens_seen": 347741552, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4191, "time_per_iteration": 2.555079460144043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055942, "balance_loss_mlp": 1.0468998, "diversity_loss_mlp": 0.0, "epoch": 0.8064640246248557, "flos": 526407994368.0, "grad_norm": 0.08129119625333912, "language_loss": 0.85176903, "learning_rate": 9.50524673227231e-05, "loss": 0.86232841, "num_input_tokens_seen": 347807008, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4192, "time_per_iteration": 2.616278648376465 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057227, "balance_loss_mlp": 1.04823291, "diversity_loss_mlp": 0.0, "epoch": 0.8066564063101193, "flos": 865115458560.0, "grad_norm": 0.06195550147591559, "language_loss": 0.8222602, "learning_rate": 9.486980307710208e-05, "loss": 0.83283252, "num_input_tokens_seen": 347895728, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4193, "time_per_iteration": 3.1774582862854004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055176, "balance_loss_mlp": 1.04616332, "diversity_loss_mlp": 0.0, "epoch": 0.8068487879953828, "flos": 530536614912.0, "grad_norm": 0.07492247011829438, "language_loss": 0.82230604, "learning_rate": 9.468729611697246e-05, "loss": 0.83285773, "num_input_tokens_seen": 347970368, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4194, "time_per_iteration": 2.711758613586426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105404, "balance_loss_mlp": 1.04514122, "diversity_loss_mlp": 0.0, "epoch": 0.8070411696806464, "flos": 566183291904.0, "grad_norm": 0.05932556750810355, "language_loss": 0.81710708, "learning_rate": 9.450494651319003e-05, "loss": 0.82764751, "num_input_tokens_seen": 348039040, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4195, "time_per_iteration": 2.6608495712280273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058947, "balance_loss_mlp": 1.04997635, "diversity_loss_mlp": 0.0, "epoch": 0.80723355136591, "flos": 986591010816.0, "grad_norm": 0.063085164329588, "language_loss": 0.79428887, "learning_rate": 9.432275433654885e-05, "loss": 0.80487841, "num_input_tokens_seen": 348126064, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4196, "time_per_iteration": 3.337599515914917 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058486, "balance_loss_mlp": 1.04924726, "diversity_loss_mlp": 0.0, "epoch": 0.8074259330511735, "flos": 566961513984.0, "grad_norm": 0.06810941123985487, "language_loss": 0.82549566, "learning_rate": 9.414071965778221e-05, "loss": 0.83608055, "num_input_tokens_seen": 348205888, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4197, "time_per_iteration": 2.8500421047210693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00793266, "balance_loss_mlp": 1.33856153, "diversity_loss_mlp": 0.22554049, "epoch": 0.8076183147364371, "flos": 494662712832.0, "grad_norm": 0.030004109162440378, "language_loss": 0.80021191, "learning_rate": 9.395884254756242e-05, "loss": 0.80814457, "num_input_tokens_seen": 348278608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01121513, "step": 4198, "time_per_iteration": 2.7939352989196777 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01061077, "balance_loss_mlp": 1.05221987, "diversity_loss_mlp": 0.0, "epoch": 0.8078106964217007, "flos": 420011993088.0, "grad_norm": 0.07237334672543508, "language_loss": 0.79747534, "learning_rate": 9.377712307650044e-05, "loss": 0.80808604, "num_input_tokens_seen": 348341312, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4199, "time_per_iteration": 2.616584300994873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060422, "balance_loss_mlp": 1.05148149, "diversity_loss_mlp": 0.0, "epoch": 0.8080030781069643, "flos": 527537152512.0, "grad_norm": 0.07529347845483464, "language_loss": 0.83181953, "learning_rate": 9.359556131514602e-05, "loss": 0.8424238, "num_input_tokens_seen": 348409184, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4200, "time_per_iteration": 2.6320338249206543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788939, "balance_loss_mlp": 1.33364224, "diversity_loss_mlp": 0.22200125, "epoch": 0.8081954597922277, "flos": 544148554752.0, "grad_norm": 0.03126306975747278, "language_loss": 0.8159976, "learning_rate": 9.341415733398733e-05, "loss": 0.82388693, "num_input_tokens_seen": 348480832, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01111754, "step": 4201, "time_per_iteration": 2.725898265838623 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060854, "balance_loss_mlp": 1.05191302, "diversity_loss_mlp": 0.0, "epoch": 0.8083878414774913, "flos": 640900823040.0, "grad_norm": 0.07028300429625041, "language_loss": 0.75730419, "learning_rate": 9.323291120345207e-05, "loss": 0.76791275, "num_input_tokens_seen": 348559232, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4202, "time_per_iteration": 2.858754873275757 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057987, "balance_loss_mlp": 1.04905808, "diversity_loss_mlp": 0.0, "epoch": 0.8085802231627549, "flos": 705614146560.0, "grad_norm": 0.07410213802766576, "language_loss": 0.72826529, "learning_rate": 9.305182299390614e-05, "loss": 0.73884517, "num_input_tokens_seen": 348638960, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4203, "time_per_iteration": 2.910843849182129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053797, "balance_loss_mlp": 1.04489827, "diversity_loss_mlp": 0.0, "epoch": 0.8087726048480185, "flos": 419762373120.0, "grad_norm": 0.07872218498382196, "language_loss": 0.88753879, "learning_rate": 9.287089277565409e-05, "loss": 0.89807671, "num_input_tokens_seen": 348704816, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4204, "time_per_iteration": 2.531914234161377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059733, "balance_loss_mlp": 1.05073869, "diversity_loss_mlp": 0.0, "epoch": 0.8089649865332821, "flos": 508766178816.0, "grad_norm": 0.05750820164302825, "language_loss": 0.87048918, "learning_rate": 9.269012061893922e-05, "loss": 0.88108647, "num_input_tokens_seen": 348783504, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4205, "time_per_iteration": 2.7968151569366455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052186, "balance_loss_mlp": 1.04308999, "diversity_loss_mlp": 0.0, "epoch": 0.8091573682185456, "flos": 457219883520.0, "grad_norm": 0.06433103951625496, "language_loss": 0.8483271, "learning_rate": 9.250950659394386e-05, "loss": 0.85884893, "num_input_tokens_seen": 348858272, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4206, "time_per_iteration": 2.665961742401123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050706, "balance_loss_mlp": 1.04172313, "diversity_loss_mlp": 0.0, "epoch": 0.8093497499038091, "flos": 525256441344.0, "grad_norm": 0.0784365412189913, "language_loss": 0.77137649, "learning_rate": 9.232905077078824e-05, "loss": 0.7818836, "num_input_tokens_seen": 348934432, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4207, "time_per_iteration": 2.7918972969055176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105439, "balance_loss_mlp": 1.04530609, "diversity_loss_mlp": 0.0, "epoch": 0.8095421315890727, "flos": 489617478144.0, "grad_norm": 0.07290792729834863, "language_loss": 0.76617867, "learning_rate": 9.214875321953164e-05, "loss": 0.77672255, "num_input_tokens_seen": 349003856, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4208, "time_per_iteration": 2.6330010890960693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056212, "balance_loss_mlp": 1.04722369, "diversity_loss_mlp": 0.0, "epoch": 0.8097345132743363, "flos": 625109861376.0, "grad_norm": 0.06967828145804263, "language_loss": 0.81180429, "learning_rate": 9.196861401017164e-05, "loss": 0.82236642, "num_input_tokens_seen": 349080544, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4209, "time_per_iteration": 2.8048768043518066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053814, "balance_loss_mlp": 1.04471278, "diversity_loss_mlp": 0.0, "epoch": 0.8099268949595998, "flos": 615688584192.0, "grad_norm": 0.08832200116465504, "language_loss": 0.79589164, "learning_rate": 9.178863321264475e-05, "loss": 0.8064298, "num_input_tokens_seen": 349159072, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4210, "time_per_iteration": 2.775315046310425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053364, "balance_loss_mlp": 1.04430985, "diversity_loss_mlp": 0.0, "epoch": 0.8101192766448634, "flos": 479642632704.0, "grad_norm": 0.05749425026246104, "language_loss": 0.79754937, "learning_rate": 9.160881089682566e-05, "loss": 0.80808306, "num_input_tokens_seen": 349230176, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4211, "time_per_iteration": 2.6467440128326416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051728, "balance_loss_mlp": 1.04233456, "diversity_loss_mlp": 0.0, "epoch": 0.810311658330127, "flos": 517327741440.0, "grad_norm": 0.06468521234127066, "language_loss": 0.8684355, "learning_rate": 9.142914713252725e-05, "loss": 0.87895274, "num_input_tokens_seen": 349299760, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 4212, "time_per_iteration": 2.6296494007110596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051493, "balance_loss_mlp": 1.04236174, "diversity_loss_mlp": 0.0, "epoch": 0.8105040400153906, "flos": 575782235136.0, "grad_norm": 0.05999607560391635, "language_loss": 0.84117031, "learning_rate": 9.124964198950159e-05, "loss": 0.85168523, "num_input_tokens_seen": 349379712, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 4213, "time_per_iteration": 2.834974527359009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048457, "balance_loss_mlp": 1.03935528, "diversity_loss_mlp": 0.0, "epoch": 0.8106964217006541, "flos": 638963707392.0, "grad_norm": 0.07539161755647025, "language_loss": 0.85083151, "learning_rate": 9.107029553743862e-05, "loss": 0.86131608, "num_input_tokens_seen": 349460320, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 4214, "time_per_iteration": 2.8861420154571533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053208, "balance_loss_mlp": 1.04424381, "diversity_loss_mlp": 0.0, "epoch": 0.8108888033859176, "flos": 579505964544.0, "grad_norm": 0.07165268891230793, "language_loss": 0.81364369, "learning_rate": 9.089110784596672e-05, "loss": 0.82417578, "num_input_tokens_seen": 349527648, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4215, "time_per_iteration": 2.6690080165863037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047412, "balance_loss_mlp": 1.03829873, "diversity_loss_mlp": 0.0, "epoch": 0.8110811850711812, "flos": 559907209728.0, "grad_norm": 0.05808229124837682, "language_loss": 0.83832216, "learning_rate": 9.071207898465284e-05, "loss": 0.84879631, "num_input_tokens_seen": 349606912, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4216, "time_per_iteration": 2.8289334774017334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01012526, "balance_loss_mlp": 1.00782871, "diversity_loss_mlp": 0.0, "epoch": 0.8112735667564448, "flos": 1517939979264.0, "grad_norm": 0.01559500500099235, "language_loss": 0.77260417, "learning_rate": 9.053320902300205e-05, "loss": 0.78272945, "num_input_tokens_seen": 349827040, "router_z_loss_mlp": 0.046875, "routerloss_mlp": 0.0, "step": 4217, "time_per_iteration": 4.674102067947388 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051919, "balance_loss_mlp": 1.04281104, "diversity_loss_mlp": 0.0, "epoch": 0.8114659484417084, "flos": 616340897280.0, "grad_norm": 0.07154355832559847, "language_loss": 0.85079706, "learning_rate": 9.035449803045792e-05, "loss": 0.86131632, "num_input_tokens_seen": 349900080, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4218, "time_per_iteration": 2.8154706954956055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043945, "balance_loss_mlp": 1.03502214, "diversity_loss_mlp": 0.0, "epoch": 0.8116583301269719, "flos": 649951340544.0, "grad_norm": 0.06078221490906587, "language_loss": 0.79071403, "learning_rate": 9.017594607640211e-05, "loss": 0.80115348, "num_input_tokens_seen": 349983568, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4219, "time_per_iteration": 2.9709677696228027 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047615, "balance_loss_mlp": 1.03838241, "diversity_loss_mlp": 0.0, "epoch": 0.8118507118122354, "flos": 553087844352.0, "grad_norm": 0.07350013125355677, "language_loss": 0.80881071, "learning_rate": 8.999755323015463e-05, "loss": 0.81928694, "num_input_tokens_seen": 350054928, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 4220, "time_per_iteration": 2.7022857666015625 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046017, "balance_loss_mlp": 1.03677237, "diversity_loss_mlp": 0.0, "epoch": 0.812043093497499, "flos": 544118819328.0, "grad_norm": 0.06142059768116679, "language_loss": 0.87557077, "learning_rate": 8.981931956097384e-05, "loss": 0.88603091, "num_input_tokens_seen": 350127872, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 4221, "time_per_iteration": 2.637735366821289 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052382, "balance_loss_mlp": 1.04335153, "diversity_loss_mlp": 0.0, "epoch": 0.8122354751827626, "flos": 583404788736.0, "grad_norm": 0.06689891729172881, "language_loss": 0.83563554, "learning_rate": 8.964124513805628e-05, "loss": 0.84615934, "num_input_tokens_seen": 350206592, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4222, "time_per_iteration": 2.792409658432007 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010868, "balance_loss_mlp": 1.00612342, "diversity_loss_mlp": 0.0, "epoch": 0.8124278568680262, "flos": 1530568120320.0, "grad_norm": 0.013920089604171917, "language_loss": 0.78250074, "learning_rate": 8.94633300305363e-05, "loss": 0.79260939, "num_input_tokens_seen": 350436048, "router_z_loss_mlp": 0.04736328, "routerloss_mlp": 0.0, "step": 4223, "time_per_iteration": 4.96152138710022 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051266, "balance_loss_mlp": 1.04209328, "diversity_loss_mlp": 0.0, "epoch": 0.8126202385532897, "flos": 432865161216.0, "grad_norm": 0.07751812943068913, "language_loss": 0.8010273, "learning_rate": 8.928557430748668e-05, "loss": 0.81153995, "num_input_tokens_seen": 350501376, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4224, "time_per_iteration": 2.6411619186401367 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010841, "balance_loss_mlp": 1.00612068, "diversity_loss_mlp": 0.0, "epoch": 0.8128126202385533, "flos": 1547905987584.0, "grad_norm": 0.013617776499522711, "language_loss": 0.76495624, "learning_rate": 8.910797803791854e-05, "loss": 0.77506471, "num_input_tokens_seen": 350735232, "router_z_loss_mlp": 0.04711914, "routerloss_mlp": 0.0, "step": 4225, "time_per_iteration": 4.849999904632568 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047849, "balance_loss_mlp": 1.03853297, "diversity_loss_mlp": 0.0, "epoch": 0.8130050019238169, "flos": 528317945856.0, "grad_norm": 0.06825415899254728, "language_loss": 0.88826978, "learning_rate": 8.893054129078077e-05, "loss": 0.89874828, "num_input_tokens_seen": 350805088, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 4226, "time_per_iteration": 2.6051902770996094 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104715, "balance_loss_mlp": 1.03806627, "diversity_loss_mlp": 0.0, "epoch": 0.8131973836090804, "flos": 543125481984.0, "grad_norm": 0.07913354085389648, "language_loss": 0.80409497, "learning_rate": 8.875326413496037e-05, "loss": 0.81456649, "num_input_tokens_seen": 350876896, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4227, "time_per_iteration": 2.709742307662964 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046017, "balance_loss_mlp": 1.03684425, "diversity_loss_mlp": 0.0, "epoch": 0.8133897652943439, "flos": 576494019072.0, "grad_norm": 0.11840379948544452, "language_loss": 0.82457888, "learning_rate": 8.857614663928249e-05, "loss": 0.83503902, "num_input_tokens_seen": 350948400, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4228, "time_per_iteration": 2.6976981163024902 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051797, "balance_loss_mlp": 1.04245067, "diversity_loss_mlp": 0.0, "epoch": 0.8135821469796075, "flos": 579219268608.0, "grad_norm": 0.077990176521043, "language_loss": 0.78880024, "learning_rate": 8.839918887251025e-05, "loss": 0.79931819, "num_input_tokens_seen": 351023328, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 4229, "time_per_iteration": 2.7945659160614014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105256, "balance_loss_mlp": 1.04340506, "diversity_loss_mlp": 0.0, "epoch": 0.8137745286648711, "flos": 650346693120.0, "grad_norm": 0.06092121648139386, "language_loss": 0.84136802, "learning_rate": 8.822239090334472e-05, "loss": 0.8518936, "num_input_tokens_seen": 351108672, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4230, "time_per_iteration": 2.946951389312744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047623, "balance_loss_mlp": 1.03831291, "diversity_loss_mlp": 0.0, "epoch": 0.8139669103501347, "flos": 701888219136.0, "grad_norm": 0.06877906362209742, "language_loss": 0.75546557, "learning_rate": 8.804575280042493e-05, "loss": 0.7659418, "num_input_tokens_seen": 351185056, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 4231, "time_per_iteration": 2.8897807598114014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051697, "balance_loss_mlp": 1.04225588, "diversity_loss_mlp": 0.0, "epoch": 0.8141592920353983, "flos": 650223355392.0, "grad_norm": 0.07632389877762422, "language_loss": 0.82944, "learning_rate": 8.786927463232774e-05, "loss": 0.839957, "num_input_tokens_seen": 351255856, "router_z_loss_mlp": 0.09442139, "routerloss_mlp": 0.0, "step": 4232, "time_per_iteration": 2.755648374557495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052198, "balance_loss_mlp": 1.04287577, "diversity_loss_mlp": 0.0, "epoch": 0.8143516737206618, "flos": 536829949440.0, "grad_norm": 0.07245949865511514, "language_loss": 0.81604928, "learning_rate": 8.769295646756853e-05, "loss": 0.82657123, "num_input_tokens_seen": 351322336, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 4233, "time_per_iteration": 2.573910713195801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048453, "balance_loss_mlp": 1.03923225, "diversity_loss_mlp": 0.0, "epoch": 0.8145440554059253, "flos": 508366056960.0, "grad_norm": 0.07474822596726854, "language_loss": 0.82091659, "learning_rate": 8.751679837459963e-05, "loss": 0.83140111, "num_input_tokens_seen": 351387440, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4234, "time_per_iteration": 2.595383405685425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050121, "balance_loss_mlp": 1.04080522, "diversity_loss_mlp": 0.0, "epoch": 0.8147364370911889, "flos": 635032576512.0, "grad_norm": 0.05760879468903708, "language_loss": 0.86682582, "learning_rate": 8.734080042181181e-05, "loss": 0.87732702, "num_input_tokens_seen": 351464192, "router_z_loss_mlp": 0.09307861, "routerloss_mlp": 0.0, "step": 4235, "time_per_iteration": 2.8454620838165283 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050489, "balance_loss_mlp": 1.04129791, "diversity_loss_mlp": 0.0, "epoch": 0.8149288187764525, "flos": 422801482752.0, "grad_norm": 0.07072559835413951, "language_loss": 0.78216445, "learning_rate": 8.716496267753343e-05, "loss": 0.7926693, "num_input_tokens_seen": 351528016, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 4236, "time_per_iteration": 2.4742040634155273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047208, "balance_loss_mlp": 1.03813028, "diversity_loss_mlp": 0.0, "epoch": 0.8151212004617161, "flos": 597444014592.0, "grad_norm": 0.06449709049791848, "language_loss": 0.81412882, "learning_rate": 8.698928521003097e-05, "loss": 0.82460093, "num_input_tokens_seen": 351601648, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4237, "time_per_iteration": 2.7545273303985596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006046, "balance_loss_mlp": 1.00139654, "diversity_loss_mlp": 0.0, "epoch": 0.8153135821469796, "flos": 1479330915840.0, "grad_norm": 0.010587263465776719, "language_loss": 0.77852845, "learning_rate": 8.681376808750835e-05, "loss": 0.78858888, "num_input_tokens_seen": 351826720, "router_z_loss_mlp": 0.04638672, "routerloss_mlp": 0.0, "step": 4238, "time_per_iteration": 5.016268730163574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047193, "balance_loss_mlp": 1.03776968, "diversity_loss_mlp": 0.0, "epoch": 0.8155059638322432, "flos": 437097669120.0, "grad_norm": 0.0684339838675198, "language_loss": 0.82887548, "learning_rate": 8.663841137810741e-05, "loss": 0.83934742, "num_input_tokens_seen": 351891760, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 4239, "time_per_iteration": 2.5211598873138428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052042, "balance_loss_mlp": 1.04271388, "diversity_loss_mlp": 0.0, "epoch": 0.8156983455175068, "flos": 794390727168.0, "grad_norm": 0.06874840636234532, "language_loss": 0.85361314, "learning_rate": 8.646321514990763e-05, "loss": 0.8641336, "num_input_tokens_seen": 351977504, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 4240, "time_per_iteration": 3.083944797515869 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00785137, "balance_loss_mlp": 1.32642579, "diversity_loss_mlp": 0.22223642, "epoch": 0.8158907272027703, "flos": 685986029568.0, "grad_norm": 0.03037997104545499, "language_loss": 0.81663668, "learning_rate": 8.628817947092616e-05, "loss": 0.82448804, "num_input_tokens_seen": 352050176, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0108057, "step": 4241, "time_per_iteration": 2.849032163619995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00796697, "balance_loss_mlp": 1.3468852, "diversity_loss_mlp": 0.22464219, "epoch": 0.8160831088880338, "flos": 487055213568.0, "grad_norm": 0.041459762566519655, "language_loss": 0.84508646, "learning_rate": 8.611330440911797e-05, "loss": 0.85305345, "num_input_tokens_seen": 352116848, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0109333, "step": 4242, "time_per_iteration": 2.6374778747558594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010496, "balance_loss_mlp": 1.0404923, "diversity_loss_mlp": 0.0, "epoch": 0.8162754905732974, "flos": 464872172544.0, "grad_norm": 0.06813712019116032, "language_loss": 0.80444574, "learning_rate": 8.593859003237558e-05, "loss": 0.81494176, "num_input_tokens_seen": 352185056, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 4243, "time_per_iteration": 2.5741348266601562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01005855, "balance_loss_mlp": 1.00120556, "diversity_loss_mlp": 0.0, "epoch": 0.816467872258561, "flos": 1239530522112.0, "grad_norm": 0.012183850402686274, "language_loss": 0.75285125, "learning_rate": 8.576403640852904e-05, "loss": 0.76290977, "num_input_tokens_seen": 352397648, "router_z_loss_mlp": 0.04638672, "routerloss_mlp": 0.0, "step": 4244, "time_per_iteration": 4.708779573440552 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0079579, "balance_loss_mlp": 1.34605587, "diversity_loss_mlp": 0.22397524, "epoch": 0.8166602539438246, "flos": 687169516032.0, "grad_norm": 0.030280251177676618, "language_loss": 0.86728865, "learning_rate": 8.558964360534615e-05, "loss": 0.87524652, "num_input_tokens_seen": 352478272, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01077495, "step": 4245, "time_per_iteration": 2.9368019104003906 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006174, "balance_loss_mlp": 1.00154853, "diversity_loss_mlp": 0.0, "epoch": 0.8168526356290882, "flos": 1490520807936.0, "grad_norm": 0.013862139423476765, "language_loss": 0.72974741, "learning_rate": 8.541541169053219e-05, "loss": 0.73980916, "num_input_tokens_seen": 352707104, "router_z_loss_mlp": 0.04614258, "routerloss_mlp": 0.0, "step": 4246, "time_per_iteration": 4.941858291625977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078277, "balance_loss_mlp": 1.31999934, "diversity_loss_mlp": 0.22372745, "epoch": 0.8170450173143516, "flos": 578201338368.0, "grad_norm": 0.027810419821976344, "language_loss": 0.84806323, "learning_rate": 8.524134073172984e-05, "loss": 0.85589087, "num_input_tokens_seen": 352779248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01090694, "step": 4247, "time_per_iteration": 2.7287490367889404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791953, "balance_loss_mlp": 1.33846903, "diversity_loss_mlp": 0.22388186, "epoch": 0.8172373989996152, "flos": 571275514368.0, "grad_norm": 0.03087757735964202, "language_loss": 0.84696209, "learning_rate": 8.506743079651974e-05, "loss": 0.85488164, "num_input_tokens_seen": 352856784, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01077755, "step": 4248, "time_per_iteration": 2.7625157833099365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053911, "balance_loss_mlp": 1.04469025, "diversity_loss_mlp": 0.0, "epoch": 0.8174297806848788, "flos": 528831866880.0, "grad_norm": 0.06506910983745173, "language_loss": 0.80918235, "learning_rate": 8.489368195241948e-05, "loss": 0.81972146, "num_input_tokens_seen": 352926496, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 4249, "time_per_iteration": 2.6258833408355713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044558, "balance_loss_mlp": 1.03533733, "diversity_loss_mlp": 0.0, "epoch": 0.8176221623701424, "flos": 569108602368.0, "grad_norm": 0.06744676767794172, "language_loss": 0.78911942, "learning_rate": 8.47200942668846e-05, "loss": 0.79956502, "num_input_tokens_seen": 353005312, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4250, "time_per_iteration": 2.7859880924224854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048873, "balance_loss_mlp": 1.03986096, "diversity_loss_mlp": 0.0, "epoch": 0.8178145440554059, "flos": 656521459200.0, "grad_norm": 0.09007032647039148, "language_loss": 0.80543828, "learning_rate": 8.454666780730735e-05, "loss": 0.81592703, "num_input_tokens_seen": 353085120, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4251, "time_per_iteration": 2.8444883823394775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050772, "balance_loss_mlp": 1.04183125, "diversity_loss_mlp": 0.0, "epoch": 0.8180069257406695, "flos": 545924883456.0, "grad_norm": 0.06143293566062141, "language_loss": 0.87781107, "learning_rate": 8.437340264101828e-05, "loss": 0.88831878, "num_input_tokens_seen": 353160992, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4252, "time_per_iteration": 2.710468053817749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051813, "balance_loss_mlp": 1.04260981, "diversity_loss_mlp": 0.0, "epoch": 0.818199307425933, "flos": 619271350272.0, "grad_norm": 0.06730242930695572, "language_loss": 0.84812832, "learning_rate": 8.420029883528474e-05, "loss": 0.85864639, "num_input_tokens_seen": 353233328, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 4253, "time_per_iteration": 2.7251899242401123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052007, "balance_loss_mlp": 1.04279804, "diversity_loss_mlp": 0.0, "epoch": 0.8183916891111966, "flos": 647618872320.0, "grad_norm": 0.07105593379415724, "language_loss": 0.77203315, "learning_rate": 8.402735645731157e-05, "loss": 0.7825532, "num_input_tokens_seen": 353310592, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 4254, "time_per_iteration": 2.8979763984680176 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046206, "balance_loss_mlp": 1.03733134, "diversity_loss_mlp": 0.0, "epoch": 0.8185840707964602, "flos": 499120247808.0, "grad_norm": 0.07494925573658785, "language_loss": 0.77925122, "learning_rate": 8.385457557424098e-05, "loss": 0.78971332, "num_input_tokens_seen": 353376544, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4255, "time_per_iteration": 2.5896246433258057 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048093, "balance_loss_mlp": 1.03896809, "diversity_loss_mlp": 0.0, "epoch": 0.8187764524817237, "flos": 786229659648.0, "grad_norm": 0.05893979232495145, "language_loss": 0.79938138, "learning_rate": 8.368195625315251e-05, "loss": 0.80986238, "num_input_tokens_seen": 353461200, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4256, "time_per_iteration": 3.068570852279663 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047062, "balance_loss_mlp": 1.03782368, "diversity_loss_mlp": 0.0, "epoch": 0.8189688341669873, "flos": 550710959616.0, "grad_norm": 0.07101674717136439, "language_loss": 0.80977142, "learning_rate": 8.350949856106283e-05, "loss": 0.82024205, "num_input_tokens_seen": 353538608, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4257, "time_per_iteration": 2.7494471073150635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006173, "balance_loss_mlp": 1.00154781, "diversity_loss_mlp": 0.0, "epoch": 0.8191612158522509, "flos": 1351972435968.0, "grad_norm": 0.007149039484563577, "language_loss": 0.71149343, "learning_rate": 8.333720256492599e-05, "loss": 0.72155517, "num_input_tokens_seen": 353766960, "router_z_loss_mlp": 0.04614258, "routerloss_mlp": 0.0, "step": 4258, "time_per_iteration": 4.839837074279785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043595, "balance_loss_mlp": 1.03455889, "diversity_loss_mlp": 0.0, "epoch": 0.8193535975375145, "flos": 544257211392.0, "grad_norm": 0.06534196989657123, "language_loss": 0.84030735, "learning_rate": 8.316506833163318e-05, "loss": 0.85074329, "num_input_tokens_seen": 353833552, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4259, "time_per_iteration": 2.6422817707061768 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050217, "balance_loss_mlp": 1.04123449, "diversity_loss_mlp": 0.0, "epoch": 0.8195459792227779, "flos": 865733266944.0, "grad_norm": 0.05670368476253994, "language_loss": 0.85545492, "learning_rate": 8.299309592801297e-05, "loss": 0.86595714, "num_input_tokens_seen": 353915520, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4260, "time_per_iteration": 3.125713586807251 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050233, "balance_loss_mlp": 1.04122066, "diversity_loss_mlp": 0.0, "epoch": 0.8197383609080415, "flos": 569293982208.0, "grad_norm": 0.06904116359736774, "language_loss": 0.81980395, "learning_rate": 8.282128542083101e-05, "loss": 0.83030629, "num_input_tokens_seen": 353992048, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4261, "time_per_iteration": 2.76778507232666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045572, "balance_loss_mlp": 1.03641081, "diversity_loss_mlp": 0.0, "epoch": 0.8199307425933051, "flos": 530813399040.0, "grad_norm": 0.058406154368980764, "language_loss": 0.85347754, "learning_rate": 8.264963687678978e-05, "loss": 0.86393321, "num_input_tokens_seen": 354064848, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4262, "time_per_iteration": 2.628774404525757 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052152, "balance_loss_mlp": 1.04290724, "diversity_loss_mlp": 0.0, "epoch": 0.8201231242785687, "flos": 567070170624.0, "grad_norm": 0.09112328550849395, "language_loss": 0.85125005, "learning_rate": 8.247815036252921e-05, "loss": 0.86177158, "num_input_tokens_seen": 354138848, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4263, "time_per_iteration": 2.7492353916168213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048431, "balance_loss_mlp": 1.03952646, "diversity_loss_mlp": 0.0, "epoch": 0.8203155059638323, "flos": 1230505717248.0, "grad_norm": 0.06356232342525024, "language_loss": 0.82992971, "learning_rate": 8.230682594462652e-05, "loss": 0.84041393, "num_input_tokens_seen": 354227696, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4264, "time_per_iteration": 3.54941725730896 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052131, "balance_loss_mlp": 1.04260004, "diversity_loss_mlp": 0.0, "epoch": 0.8205078876490958, "flos": 574198626816.0, "grad_norm": 0.061154055751469906, "language_loss": 0.79944229, "learning_rate": 8.213566368959558e-05, "loss": 0.80996358, "num_input_tokens_seen": 354298400, "router_z_loss_mlp": 0.09521484, "routerloss_mlp": 0.0, "step": 4265, "time_per_iteration": 2.677964210510254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052685, "balance_loss_mlp": 1.04367328, "diversity_loss_mlp": 0.0, "epoch": 0.8207002693343594, "flos": 931400280576.0, "grad_norm": 0.06353811334374408, "language_loss": 0.78419554, "learning_rate": 8.196466366388744e-05, "loss": 0.79472238, "num_input_tokens_seen": 354385024, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4266, "time_per_iteration": 3.203380823135376 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052882, "balance_loss_mlp": 1.04395366, "diversity_loss_mlp": 0.0, "epoch": 0.8208926510196229, "flos": 549571889664.0, "grad_norm": 0.06191713334502218, "language_loss": 0.80525327, "learning_rate": 8.179382593389029e-05, "loss": 0.81578207, "num_input_tokens_seen": 354456384, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4267, "time_per_iteration": 2.6596202850341797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056507, "balance_loss_mlp": 1.04715538, "diversity_loss_mlp": 0.0, "epoch": 0.8210850327048865, "flos": 648182352384.0, "grad_norm": 0.06008885513704129, "language_loss": 0.81976879, "learning_rate": 8.162315056592918e-05, "loss": 0.83033383, "num_input_tokens_seen": 354531296, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 4268, "time_per_iteration": 2.8304736614227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053453, "balance_loss_mlp": 1.04451835, "diversity_loss_mlp": 0.0, "epoch": 0.82127741439015, "flos": 601520878080.0, "grad_norm": 0.06523361113761998, "language_loss": 0.81845587, "learning_rate": 8.145263762626615e-05, "loss": 0.82899046, "num_input_tokens_seen": 354605680, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4269, "time_per_iteration": 2.7376768589019775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105318, "balance_loss_mlp": 1.04417932, "diversity_loss_mlp": 0.0, "epoch": 0.8214697960754136, "flos": 474831963648.0, "grad_norm": 0.07673767837283801, "language_loss": 0.83897698, "learning_rate": 8.128228718110015e-05, "loss": 0.84950882, "num_input_tokens_seen": 354678160, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4270, "time_per_iteration": 2.6805686950683594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051913, "balance_loss_mlp": 1.04284751, "diversity_loss_mlp": 0.0, "epoch": 0.8216621777606772, "flos": 903648172032.0, "grad_norm": 0.07279388279593675, "language_loss": 0.85111851, "learning_rate": 8.11120992965671e-05, "loss": 0.86163765, "num_input_tokens_seen": 354751024, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4271, "time_per_iteration": 3.080000877380371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783822, "balance_loss_mlp": 1.32480633, "diversity_loss_mlp": 0.22162104, "epoch": 0.8218545594459408, "flos": 514461528576.0, "grad_norm": 0.033634037430315754, "language_loss": 0.82290757, "learning_rate": 8.094207403873998e-05, "loss": 0.83074582, "num_input_tokens_seen": 354819408, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01060844, "step": 4272, "time_per_iteration": 2.615750789642334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049914, "balance_loss_mlp": 1.04102731, "diversity_loss_mlp": 0.0, "epoch": 0.8220469411312044, "flos": 494536803840.0, "grad_norm": 0.07856247677174821, "language_loss": 0.86208439, "learning_rate": 8.077221147362829e-05, "loss": 0.87258351, "num_input_tokens_seen": 354887376, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4273, "time_per_iteration": 2.6263344287872314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051099, "balance_loss_mlp": 1.04169989, "diversity_loss_mlp": 0.0, "epoch": 0.8222393228164678, "flos": 386433483264.0, "grad_norm": 0.08144467378809686, "language_loss": 0.89614367, "learning_rate": 8.060251166717835e-05, "loss": 0.90665472, "num_input_tokens_seen": 354948288, "router_z_loss_mlp": 0.09393311, "routerloss_mlp": 0.0, "step": 4274, "time_per_iteration": 2.400228500366211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054991, "balance_loss_mlp": 1.04600263, "diversity_loss_mlp": 0.0, "epoch": 0.8224317045017314, "flos": 536590241280.0, "grad_norm": 0.06163444359601604, "language_loss": 0.86974454, "learning_rate": 8.043297468527383e-05, "loss": 0.88029444, "num_input_tokens_seen": 355016912, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4275, "time_per_iteration": 2.6878175735473633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048878, "balance_loss_mlp": 1.03988957, "diversity_loss_mlp": 0.0, "epoch": 0.822624086186995, "flos": 554899051008.0, "grad_norm": 0.07177776406534302, "language_loss": 0.82458985, "learning_rate": 8.02636005937346e-05, "loss": 0.83507866, "num_input_tokens_seen": 355085936, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4276, "time_per_iteration": 2.666274070739746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050704, "balance_loss_mlp": 1.04178667, "diversity_loss_mlp": 0.0, "epoch": 0.8228164678722586, "flos": 539579791872.0, "grad_norm": 0.06822688117582502, "language_loss": 0.79940748, "learning_rate": 8.009438945831771e-05, "loss": 0.80991459, "num_input_tokens_seen": 355161984, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4277, "time_per_iteration": 2.6920108795166016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052707, "balance_loss_mlp": 1.04362309, "diversity_loss_mlp": 0.0, "epoch": 0.8230088495575221, "flos": 473253124608.0, "grad_norm": 0.06798166655440095, "language_loss": 0.79305434, "learning_rate": 7.992534134471641e-05, "loss": 0.80358148, "num_input_tokens_seen": 355234544, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4278, "time_per_iteration": 2.6593875885009766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056035, "balance_loss_mlp": 1.04679036, "diversity_loss_mlp": 0.0, "epoch": 0.8232012312427857, "flos": 591672314880.0, "grad_norm": 0.07994138400827414, "language_loss": 0.82999951, "learning_rate": 7.975645631856127e-05, "loss": 0.84055984, "num_input_tokens_seen": 355302896, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4279, "time_per_iteration": 2.6803600788116455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105151, "balance_loss_mlp": 1.04226494, "diversity_loss_mlp": 0.0, "epoch": 0.8233936129280492, "flos": 572644380672.0, "grad_norm": 0.060738985338191206, "language_loss": 0.744928, "learning_rate": 7.958773444541916e-05, "loss": 0.7554431, "num_input_tokens_seen": 355377040, "router_z_loss_mlp": 0.09234619, "routerloss_mlp": 0.0, "step": 4280, "time_per_iteration": 2.7890987396240234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055668, "balance_loss_mlp": 1.04667926, "diversity_loss_mlp": 0.0, "epoch": 0.8235859946133128, "flos": 731337735168.0, "grad_norm": 0.06641835359143249, "language_loss": 0.78285408, "learning_rate": 7.941917579079383e-05, "loss": 0.79341078, "num_input_tokens_seen": 355461616, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4281, "time_per_iteration": 3.0231053829193115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052951, "balance_loss_mlp": 1.04405797, "diversity_loss_mlp": 0.0, "epoch": 0.8237783762985764, "flos": 570314483712.0, "grad_norm": 0.07232954234982779, "language_loss": 0.81364781, "learning_rate": 7.92507804201253e-05, "loss": 0.82417727, "num_input_tokens_seen": 355532480, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4282, "time_per_iteration": 2.702601909637451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01009495, "balance_loss_mlp": 1.00503695, "diversity_loss_mlp": 0.0, "epoch": 0.8239707579838399, "flos": 1466232897024.0, "grad_norm": 0.005580683595342396, "language_loss": 0.75297678, "learning_rate": 7.908254839879092e-05, "loss": 0.76307166, "num_input_tokens_seen": 355768752, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4283, "time_per_iteration": 4.935715675354004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057513, "balance_loss_mlp": 1.04841709, "diversity_loss_mlp": 0.0, "epoch": 0.8241631396691035, "flos": 467313297408.0, "grad_norm": 0.0758894988729268, "language_loss": 0.81082892, "learning_rate": 7.89144797921037e-05, "loss": 0.82140398, "num_input_tokens_seen": 355838800, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4284, "time_per_iteration": 2.6500790119171143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01010322, "balance_loss_mlp": 1.00588739, "diversity_loss_mlp": 0.0, "epoch": 0.8243555213543671, "flos": 1539426290688.0, "grad_norm": 0.005340107036422925, "language_loss": 0.77934271, "learning_rate": 7.874657466531388e-05, "loss": 0.78944594, "num_input_tokens_seen": 356069280, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4285, "time_per_iteration": 4.93043065071106 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055758, "balance_loss_mlp": 1.04675198, "diversity_loss_mlp": 0.0, "epoch": 0.8245479030396307, "flos": 797429836800.0, "grad_norm": 0.052404155401405805, "language_loss": 0.82728308, "learning_rate": 7.85788330836078e-05, "loss": 0.83784062, "num_input_tokens_seen": 356164528, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4286, "time_per_iteration": 3.1566803455352783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054082, "balance_loss_mlp": 1.04502165, "diversity_loss_mlp": 0.0, "epoch": 0.8247402847248941, "flos": 646114185216.0, "grad_norm": 0.07426299244547702, "language_loss": 0.76636487, "learning_rate": 7.841125511210878e-05, "loss": 0.77690566, "num_input_tokens_seen": 356243600, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4287, "time_per_iteration": 2.894404888153076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054163, "balance_loss_mlp": 1.04488242, "diversity_loss_mlp": 0.0, "epoch": 0.8249326664101577, "flos": 604421595648.0, "grad_norm": 0.05641463912536871, "language_loss": 0.79555058, "learning_rate": 7.824384081587637e-05, "loss": 0.8060922, "num_input_tokens_seen": 356320320, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 4288, "time_per_iteration": 2.8229329586029053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058243, "balance_loss_mlp": 1.04930818, "diversity_loss_mlp": 0.0, "epoch": 0.8251250480954213, "flos": 824369218560.0, "grad_norm": 0.0762203665991507, "language_loss": 0.86487937, "learning_rate": 7.807659025990637e-05, "loss": 0.87546182, "num_input_tokens_seen": 356406928, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4289, "time_per_iteration": 3.1116397380828857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051154, "balance_loss_mlp": 1.04234433, "diversity_loss_mlp": 0.0, "epoch": 0.8253174297806849, "flos": 757382897664.0, "grad_norm": 0.0740808728635397, "language_loss": 0.78204668, "learning_rate": 7.790950350913112e-05, "loss": 0.79255825, "num_input_tokens_seen": 356481456, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4290, "time_per_iteration": 2.9050347805023193 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054866, "balance_loss_mlp": 1.04616976, "diversity_loss_mlp": 0.0, "epoch": 0.8255098114659485, "flos": 794469648384.0, "grad_norm": 0.058080618005571384, "language_loss": 0.87400663, "learning_rate": 7.774258062841971e-05, "loss": 0.88455528, "num_input_tokens_seen": 356568736, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 4291, "time_per_iteration": 3.1467742919921875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052707, "balance_loss_mlp": 1.04383206, "diversity_loss_mlp": 0.0, "epoch": 0.825702193151212, "flos": 710417475072.0, "grad_norm": 0.06448799909112234, "language_loss": 0.77267563, "learning_rate": 7.757582168257731e-05, "loss": 0.78320277, "num_input_tokens_seen": 356643328, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4292, "time_per_iteration": 2.875955581665039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105942, "balance_loss_mlp": 1.05067623, "diversity_loss_mlp": 0.0, "epoch": 0.8258945748364755, "flos": 683394029568.0, "grad_norm": 0.06489065655526868, "language_loss": 0.80734456, "learning_rate": 7.740922673634537e-05, "loss": 0.8179388, "num_input_tokens_seen": 356723824, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4293, "time_per_iteration": 2.906735420227051 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105726, "balance_loss_mlp": 1.04794431, "diversity_loss_mlp": 0.0, "epoch": 0.8260869565217391, "flos": 594563120640.0, "grad_norm": 0.06785179357058724, "language_loss": 0.78951818, "learning_rate": 7.724279585440186e-05, "loss": 0.80009079, "num_input_tokens_seen": 356796512, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 4294, "time_per_iteration": 2.721102237701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051983, "balance_loss_mlp": 1.04291677, "diversity_loss_mlp": 0.0, "epoch": 0.8262793382070027, "flos": 651480993792.0, "grad_norm": 0.07073253675532468, "language_loss": 0.8505556, "learning_rate": 7.707652910136098e-05, "loss": 0.8610754, "num_input_tokens_seen": 356868624, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4295, "time_per_iteration": 2.7751898765563965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055812, "balance_loss_mlp": 1.04672778, "diversity_loss_mlp": 0.0, "epoch": 0.8264717198922663, "flos": 538922709504.0, "grad_norm": 0.06741164173780789, "language_loss": 0.84659898, "learning_rate": 7.691042654177315e-05, "loss": 0.85715711, "num_input_tokens_seen": 356934368, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4296, "time_per_iteration": 2.6647472381591797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056393, "balance_loss_mlp": 1.04746425, "diversity_loss_mlp": 0.0, "epoch": 0.8266641015775298, "flos": 538949873664.0, "grad_norm": 0.07582259364872852, "language_loss": 0.75999844, "learning_rate": 7.674448824012514e-05, "loss": 0.77056229, "num_input_tokens_seen": 357005536, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4297, "time_per_iteration": 2.6833221912384033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053612, "balance_loss_mlp": 1.04438508, "diversity_loss_mlp": 0.0, "epoch": 0.8268564832627934, "flos": 585361728000.0, "grad_norm": 0.05929184332183984, "language_loss": 0.83883959, "learning_rate": 7.657871426083979e-05, "loss": 0.84937572, "num_input_tokens_seen": 357082160, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4298, "time_per_iteration": 2.8329238891601562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053687, "balance_loss_mlp": 1.04474664, "diversity_loss_mlp": 0.0, "epoch": 0.827048864948057, "flos": 430661173248.0, "grad_norm": 0.07448007019964706, "language_loss": 0.84225285, "learning_rate": 7.641310466827667e-05, "loss": 0.85278976, "num_input_tokens_seen": 357146928, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4299, "time_per_iteration": 2.489332675933838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049289, "balance_loss_mlp": 1.04037237, "diversity_loss_mlp": 0.0, "epoch": 0.8272412466333205, "flos": 1388430761472.0, "grad_norm": 0.06599892876771768, "language_loss": 0.85128617, "learning_rate": 7.624765952673069e-05, "loss": 0.86177909, "num_input_tokens_seen": 357236768, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4300, "time_per_iteration": 3.732990026473999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055631, "balance_loss_mlp": 1.04661894, "diversity_loss_mlp": 0.0, "epoch": 0.827433628318584, "flos": 538230749184.0, "grad_norm": 0.05906795179451105, "language_loss": 0.82889211, "learning_rate": 7.608237890043335e-05, "loss": 0.83944845, "num_input_tokens_seen": 357307568, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4301, "time_per_iteration": 2.690711259841919 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048067, "balance_loss_mlp": 1.03897715, "diversity_loss_mlp": 0.0, "epoch": 0.8276260100038476, "flos": 730734981120.0, "grad_norm": 0.07258594610710227, "language_loss": 0.77361107, "learning_rate": 7.59172628535526e-05, "loss": 0.78409171, "num_input_tokens_seen": 357387712, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4302, "time_per_iteration": 2.9701120853424072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788744, "balance_loss_mlp": 1.3319999, "diversity_loss_mlp": 0.22346261, "epoch": 0.8278183916891112, "flos": 871102273536.0, "grad_norm": 0.027743371165779296, "language_loss": 0.82558441, "learning_rate": 7.575231145019196e-05, "loss": 0.83347189, "num_input_tokens_seen": 357473360, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01101248, "step": 4303, "time_per_iteration": 3.223346471786499 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052756, "balance_loss_mlp": 1.04391634, "diversity_loss_mlp": 0.0, "epoch": 0.8280107733743748, "flos": 594543297024.0, "grad_norm": 0.05962542188798652, "language_loss": 0.7781111, "learning_rate": 7.558752475439134e-05, "loss": 0.78863871, "num_input_tokens_seen": 357548432, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4304, "time_per_iteration": 2.7994863986968994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051008, "balance_loss_mlp": 1.04218018, "diversity_loss_mlp": 0.0, "epoch": 0.8282031550596384, "flos": 768607667712.0, "grad_norm": 0.07052691004217361, "language_loss": 0.84562683, "learning_rate": 7.542290283012653e-05, "loss": 0.85613692, "num_input_tokens_seen": 357625968, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4305, "time_per_iteration": 3.0267395973205566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051296, "balance_loss_mlp": 1.04208159, "diversity_loss_mlp": 0.0, "epoch": 0.8283955367449019, "flos": 696108805632.0, "grad_norm": 0.07942922848471844, "language_loss": 0.78335333, "learning_rate": 7.525844574130947e-05, "loss": 0.79386634, "num_input_tokens_seen": 357705824, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 4306, "time_per_iteration": 2.914696455001831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049867, "balance_loss_mlp": 1.0407536, "diversity_loss_mlp": 0.0, "epoch": 0.8285879184301654, "flos": 660630256128.0, "grad_norm": 0.08577922080448468, "language_loss": 0.82953119, "learning_rate": 7.509415355178806e-05, "loss": 0.8400299, "num_input_tokens_seen": 357787040, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4307, "time_per_iteration": 2.9498178958892822 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788913, "balance_loss_mlp": 1.33115017, "diversity_loss_mlp": 0.22477263, "epoch": 0.828780300115429, "flos": 558709042176.0, "grad_norm": 0.04309088247538252, "language_loss": 0.77926069, "learning_rate": 7.493002632534618e-05, "loss": 0.78714979, "num_input_tokens_seen": 357856960, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01095133, "step": 4308, "time_per_iteration": 2.7063913345336914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050662, "balance_loss_mlp": 1.04154897, "diversity_loss_mlp": 0.0, "epoch": 0.8289726818006926, "flos": 830963930112.0, "grad_norm": 0.05899046117627297, "language_loss": 0.81765443, "learning_rate": 7.476606412570352e-05, "loss": 0.828161, "num_input_tokens_seen": 357937760, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 4309, "time_per_iteration": 3.0521981716156006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053534, "balance_loss_mlp": 1.04459929, "diversity_loss_mlp": 0.0, "epoch": 0.8291650634859561, "flos": 732289227264.0, "grad_norm": 0.07518852690871787, "language_loss": 0.80517173, "learning_rate": 7.460226701651624e-05, "loss": 0.81570709, "num_input_tokens_seen": 358012480, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4310, "time_per_iteration": 2.904289722442627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055947, "balance_loss_mlp": 1.04662442, "diversity_loss_mlp": 0.0, "epoch": 0.8293574451712197, "flos": 860910114816.0, "grad_norm": 0.06212685924060065, "language_loss": 0.81412387, "learning_rate": 7.443863506137566e-05, "loss": 0.82468331, "num_input_tokens_seen": 358100720, "router_z_loss_mlp": 0.09313965, "routerloss_mlp": 0.0, "step": 4311, "time_per_iteration": 3.203298807144165 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052126, "balance_loss_mlp": 1.04322684, "diversity_loss_mlp": 0.0, "epoch": 0.8295498268564833, "flos": 495156810240.0, "grad_norm": 0.05391272281173969, "language_loss": 0.81940407, "learning_rate": 7.427516832380948e-05, "loss": 0.8299253, "num_input_tokens_seen": 358180496, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4312, "time_per_iteration": 2.8845975399017334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055692, "balance_loss_mlp": 1.04694164, "diversity_loss_mlp": 0.0, "epoch": 0.8297422085417469, "flos": 554471391744.0, "grad_norm": 0.05500480744199572, "language_loss": 0.77808565, "learning_rate": 7.4111866867281e-05, "loss": 0.78864259, "num_input_tokens_seen": 358261104, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4313, "time_per_iteration": 2.7781200408935547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048602, "balance_loss_mlp": 1.03975666, "diversity_loss_mlp": 0.0, "epoch": 0.8299345902270104, "flos": 1247497417728.0, "grad_norm": 0.06268776190670762, "language_loss": 0.77513206, "learning_rate": 7.39487307551896e-05, "loss": 0.78561807, "num_input_tokens_seen": 358356368, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4314, "time_per_iteration": 3.6484732627868652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060785, "balance_loss_mlp": 1.05197561, "diversity_loss_mlp": 0.0, "epoch": 0.8301269719122739, "flos": 585260411904.0, "grad_norm": 0.07094165320870974, "language_loss": 0.83007073, "learning_rate": 7.378576005087034e-05, "loss": 0.84067863, "num_input_tokens_seen": 358429104, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4315, "time_per_iteration": 2.7556705474853516 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105563, "balance_loss_mlp": 1.04686821, "diversity_loss_mlp": 0.0, "epoch": 0.8303193535975375, "flos": 509732352000.0, "grad_norm": 0.06645426228125094, "language_loss": 0.84888268, "learning_rate": 7.362295481759412e-05, "loss": 0.85943896, "num_input_tokens_seen": 358501344, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4316, "time_per_iteration": 2.6553759574890137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00786621, "balance_loss_mlp": 1.32643843, "diversity_loss_mlp": 0.22519124, "epoch": 0.8305117352828011, "flos": 580652375040.0, "grad_norm": 0.03189628781024831, "language_loss": 0.83680773, "learning_rate": 7.346031511856722e-05, "loss": 0.84467387, "num_input_tokens_seen": 358575584, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01080582, "step": 4317, "time_per_iteration": 2.742246150970459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054275, "balance_loss_mlp": 1.04532266, "diversity_loss_mlp": 0.0, "epoch": 0.8307041169680647, "flos": 481626736128.0, "grad_norm": 0.06852217711760565, "language_loss": 0.7890569, "learning_rate": 7.329784101693232e-05, "loss": 0.79959965, "num_input_tokens_seen": 358644304, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4318, "time_per_iteration": 2.601116418838501 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105841, "balance_loss_mlp": 1.04927838, "diversity_loss_mlp": 0.0, "epoch": 0.8308964986533282, "flos": 624605852160.0, "grad_norm": 0.06935977491556748, "language_loss": 0.83060843, "learning_rate": 7.313553257576727e-05, "loss": 0.84119254, "num_input_tokens_seen": 358712384, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 4319, "time_per_iteration": 2.7160871028900146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052858, "balance_loss_mlp": 1.04382229, "diversity_loss_mlp": 0.0, "epoch": 0.8310888803385917, "flos": 827319495168.0, "grad_norm": 0.07045309902078044, "language_loss": 0.78631043, "learning_rate": 7.297338985808589e-05, "loss": 0.79683906, "num_input_tokens_seen": 358789264, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4320, "time_per_iteration": 3.009129762649536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059595, "balance_loss_mlp": 1.05061913, "diversity_loss_mlp": 0.0, "epoch": 0.8312812620238553, "flos": 583743241728.0, "grad_norm": 0.06816415290870351, "language_loss": 0.81865102, "learning_rate": 7.281141292683746e-05, "loss": 0.829247, "num_input_tokens_seen": 358868976, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4321, "time_per_iteration": 2.814836025238037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056811, "balance_loss_mlp": 1.04793024, "diversity_loss_mlp": 0.0, "epoch": 0.8314736437091189, "flos": 1115605052928.0, "grad_norm": 0.06950401316575304, "language_loss": 0.7471621, "learning_rate": 7.26496018449071e-05, "loss": 0.75773025, "num_input_tokens_seen": 358953600, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4322, "time_per_iteration": 3.438296318054199 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057458, "balance_loss_mlp": 1.0484755, "diversity_loss_mlp": 0.0, "epoch": 0.8316660253943825, "flos": 517547625984.0, "grad_norm": 0.07376809791811713, "language_loss": 0.82077682, "learning_rate": 7.248795667511543e-05, "loss": 0.83135134, "num_input_tokens_seen": 359028768, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4323, "time_per_iteration": 2.7750163078308105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054403, "balance_loss_mlp": 1.04560554, "diversity_loss_mlp": 0.0, "epoch": 0.831858407079646, "flos": 795329736192.0, "grad_norm": 0.07472428991139068, "language_loss": 0.77946472, "learning_rate": 7.232647748021864e-05, "loss": 0.79000878, "num_input_tokens_seen": 359116208, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4324, "time_per_iteration": 3.035860776901245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01058014, "balance_loss_mlp": 1.04919243, "diversity_loss_mlp": 0.0, "epoch": 0.8320507887649096, "flos": 549967242240.0, "grad_norm": 0.06856699827771942, "language_loss": 0.83216256, "learning_rate": 7.216516432290843e-05, "loss": 0.84274268, "num_input_tokens_seen": 359189552, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4325, "time_per_iteration": 2.705737352371216 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057993, "balance_loss_mlp": 1.04915345, "diversity_loss_mlp": 0.0, "epoch": 0.8322431704501732, "flos": 479398155264.0, "grad_norm": 0.07351613065944015, "language_loss": 0.82007957, "learning_rate": 7.20040172658123e-05, "loss": 0.83065945, "num_input_tokens_seen": 359253008, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4326, "time_per_iteration": 2.601170539855957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060086, "balance_loss_mlp": 1.0512104, "diversity_loss_mlp": 0.0, "epoch": 0.8324355521354367, "flos": 572434407936.0, "grad_norm": 0.05702554279595623, "language_loss": 0.85418373, "learning_rate": 7.184303637149308e-05, "loss": 0.86478466, "num_input_tokens_seen": 359326368, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4327, "time_per_iteration": 2.6739983558654785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057295, "balance_loss_mlp": 1.04846764, "diversity_loss_mlp": 0.0, "epoch": 0.8326279338207002, "flos": 503454071808.0, "grad_norm": 0.06350176662838333, "language_loss": 0.82565081, "learning_rate": 7.168222170244888e-05, "loss": 0.83622372, "num_input_tokens_seen": 359394192, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4328, "time_per_iteration": 2.608927011489868 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055632, "balance_loss_mlp": 1.04681087, "diversity_loss_mlp": 0.0, "epoch": 0.8328203155059638, "flos": 605743474176.0, "grad_norm": 0.06140661393609168, "language_loss": 0.81182075, "learning_rate": 7.152157332111364e-05, "loss": 0.82237709, "num_input_tokens_seen": 359476016, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4329, "time_per_iteration": 2.9149293899536133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055768, "balance_loss_mlp": 1.04682159, "diversity_loss_mlp": 0.0, "epoch": 0.8330126971912274, "flos": 697798872576.0, "grad_norm": 0.07439273272708623, "language_loss": 0.8576234, "learning_rate": 7.136109128985663e-05, "loss": 0.86818105, "num_input_tokens_seen": 359554048, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4330, "time_per_iteration": 2.9639134407043457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105965, "balance_loss_mlp": 1.05070877, "diversity_loss_mlp": 0.0, "epoch": 0.833205078876491, "flos": 494042706432.0, "grad_norm": 0.08290776170171969, "language_loss": 0.86890334, "learning_rate": 7.120077567098249e-05, "loss": 0.87949985, "num_input_tokens_seen": 359621440, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4331, "time_per_iteration": 2.6148195266723633 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054578, "balance_loss_mlp": 1.04560781, "diversity_loss_mlp": 0.0, "epoch": 0.8333974605617546, "flos": 482812793856.0, "grad_norm": 0.057322207358884096, "language_loss": 0.82625836, "learning_rate": 7.104062652673115e-05, "loss": 0.83680409, "num_input_tokens_seen": 359690320, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4332, "time_per_iteration": 2.621798515319824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056568, "balance_loss_mlp": 1.0477283, "diversity_loss_mlp": 0.0, "epoch": 0.833589842247018, "flos": 686821151232.0, "grad_norm": 0.07570063772280167, "language_loss": 0.82964915, "learning_rate": 7.088064391927818e-05, "loss": 0.84021485, "num_input_tokens_seen": 359759888, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4333, "time_per_iteration": 2.837819814682007 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053173, "balance_loss_mlp": 1.04428554, "diversity_loss_mlp": 0.0, "epoch": 0.8337822239322816, "flos": 881739343872.0, "grad_norm": 0.06974463300031715, "language_loss": 0.83023667, "learning_rate": 7.072082791073419e-05, "loss": 0.8407684, "num_input_tokens_seen": 359836544, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4334, "time_per_iteration": 3.1047897338867188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054952, "balance_loss_mlp": 1.04588628, "diversity_loss_mlp": 0.0, "epoch": 0.8339746056175452, "flos": 497183132160.0, "grad_norm": 0.07461604540726756, "language_loss": 0.82598537, "learning_rate": 7.056117856314531e-05, "loss": 0.83653492, "num_input_tokens_seen": 359903024, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4335, "time_per_iteration": 2.5917162895202637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105528, "balance_loss_mlp": 1.04616058, "diversity_loss_mlp": 0.0, "epoch": 0.8341669873028088, "flos": 510495892992.0, "grad_norm": 0.07051755558905955, "language_loss": 0.8628878, "learning_rate": 7.040169593849289e-05, "loss": 0.87344062, "num_input_tokens_seen": 359971200, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4336, "time_per_iteration": 2.6134135723114014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050741, "balance_loss_mlp": 1.04197288, "diversity_loss_mlp": 0.0, "epoch": 0.8343593689880723, "flos": 692321209344.0, "grad_norm": 0.06598640893887409, "language_loss": 0.83991468, "learning_rate": 7.024238009869366e-05, "loss": 0.85042214, "num_input_tokens_seen": 360042560, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4337, "time_per_iteration": 2.7903592586517334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052579, "balance_loss_mlp": 1.04391873, "diversity_loss_mlp": 0.0, "epoch": 0.8345517506733359, "flos": 552408367104.0, "grad_norm": 0.0663044915688964, "language_loss": 0.7816447, "learning_rate": 7.008323110559956e-05, "loss": 0.79217046, "num_input_tokens_seen": 360118048, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 4338, "time_per_iteration": 2.7299916744232178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053332, "balance_loss_mlp": 1.04413533, "diversity_loss_mlp": 0.0, "epoch": 0.8347441323585995, "flos": 592052613120.0, "grad_norm": 0.06355289445146371, "language_loss": 0.76546603, "learning_rate": 6.992424902099754e-05, "loss": 0.77599931, "num_input_tokens_seen": 360192528, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 4339, "time_per_iteration": 2.8064498901367188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052893, "balance_loss_mlp": 1.04425037, "diversity_loss_mlp": 0.0, "epoch": 0.834936514043863, "flos": 614917702656.0, "grad_norm": 0.061799613244502456, "language_loss": 0.84427285, "learning_rate": 6.976543390660983e-05, "loss": 0.85480177, "num_input_tokens_seen": 360266880, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 4340, "time_per_iteration": 2.7731611728668213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105023, "balance_loss_mlp": 1.04137301, "diversity_loss_mlp": 0.0, "epoch": 0.8351288957291266, "flos": 467844470784.0, "grad_norm": 0.15350718356465945, "language_loss": 0.79499578, "learning_rate": 6.960678582409424e-05, "loss": 0.80549812, "num_input_tokens_seen": 360336336, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4341, "time_per_iteration": 2.6016902923583984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052884, "balance_loss_mlp": 1.04431295, "diversity_loss_mlp": 0.0, "epoch": 0.8353212774143901, "flos": 509319747072.0, "grad_norm": 0.07564737297123257, "language_loss": 0.78984159, "learning_rate": 6.944830483504328e-05, "loss": 0.80037045, "num_input_tokens_seen": 360409776, "router_z_loss_mlp": 0.08581543, "routerloss_mlp": 0.0, "step": 4342, "time_per_iteration": 2.670459747314453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049647, "balance_loss_mlp": 1.04070663, "diversity_loss_mlp": 0.0, "epoch": 0.8355136590996537, "flos": 687784753152.0, "grad_norm": 0.06668235677339521, "language_loss": 0.8060447, "learning_rate": 6.928999100098483e-05, "loss": 0.81654119, "num_input_tokens_seen": 360486800, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4343, "time_per_iteration": 2.817136287689209 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783572, "balance_loss_mlp": 1.31915021, "diversity_loss_mlp": 0.22572948, "epoch": 0.8357060407849173, "flos": 984409417728.0, "grad_norm": 0.032919488551848924, "language_loss": 0.84127021, "learning_rate": 6.913184438338138e-05, "loss": 0.84910595, "num_input_tokens_seen": 360568624, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01113241, "step": 4344, "time_per_iteration": 3.2518675327301025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01059144, "balance_loss_mlp": 1.05024457, "diversity_loss_mlp": 0.0, "epoch": 0.8358984224701809, "flos": 843026393088.0, "grad_norm": 0.06270529003473267, "language_loss": 0.85050792, "learning_rate": 6.89738650436313e-05, "loss": 0.86109936, "num_input_tokens_seen": 360652384, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4345, "time_per_iteration": 3.1636109352111816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053656, "balance_loss_mlp": 1.04487062, "diversity_loss_mlp": 0.0, "epoch": 0.8360908041554445, "flos": 626239019520.0, "grad_norm": 0.07260078506489727, "language_loss": 0.82210159, "learning_rate": 6.881605304306748e-05, "loss": 0.83263814, "num_input_tokens_seen": 360723200, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4346, "time_per_iteration": 2.8204703330993652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050108, "balance_loss_mlp": 1.04092288, "diversity_loss_mlp": 0.0, "epoch": 0.8362831858407079, "flos": 576068931072.0, "grad_norm": 0.061944149403073474, "language_loss": 0.8502146, "learning_rate": 6.865840844295796e-05, "loss": 0.86071575, "num_input_tokens_seen": 360798240, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4347, "time_per_iteration": 2.805941343307495 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053763, "balance_loss_mlp": 1.04459023, "diversity_loss_mlp": 0.0, "epoch": 0.8364755675259715, "flos": 833783155200.0, "grad_norm": 0.0772733121075158, "language_loss": 0.8092171, "learning_rate": 6.850093130450569e-05, "loss": 0.81975472, "num_input_tokens_seen": 360873552, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4348, "time_per_iteration": 3.040851593017578 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790162, "balance_loss_mlp": 1.33250082, "diversity_loss_mlp": 0.22602889, "epoch": 0.8366679492112351, "flos": 582480834048.0, "grad_norm": 0.039903517211963106, "language_loss": 0.86440182, "learning_rate": 6.834362168884912e-05, "loss": 0.87230343, "num_input_tokens_seen": 360940800, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0108971, "step": 4349, "time_per_iteration": 2.699540615081787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054884, "balance_loss_mlp": 1.04582453, "diversity_loss_mlp": 0.0, "epoch": 0.8368603308964987, "flos": 611722948608.0, "grad_norm": 0.07332657660036589, "language_loss": 0.87533635, "learning_rate": 6.818647965706076e-05, "loss": 0.88588518, "num_input_tokens_seen": 361014368, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4350, "time_per_iteration": 2.7678165435791016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052928, "balance_loss_mlp": 1.04408848, "diversity_loss_mlp": 0.0, "epoch": 0.8370527125817622, "flos": 507264062976.0, "grad_norm": 0.06629049094152589, "language_loss": 0.85621446, "learning_rate": 6.802950527014884e-05, "loss": 0.86674374, "num_input_tokens_seen": 361087184, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4351, "time_per_iteration": 2.737682819366455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045769, "balance_loss_mlp": 1.03676879, "diversity_loss_mlp": 0.0, "epoch": 0.8372450942670258, "flos": 770952619008.0, "grad_norm": 0.07766225400345093, "language_loss": 0.82484055, "learning_rate": 6.787269858905603e-05, "loss": 0.8352983, "num_input_tokens_seen": 361160720, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4352, "time_per_iteration": 2.9142751693725586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048952, "balance_loss_mlp": 1.04007125, "diversity_loss_mlp": 0.0, "epoch": 0.8374374759522893, "flos": 579276168192.0, "grad_norm": 0.06438247248872511, "language_loss": 0.85065448, "learning_rate": 6.771605967466033e-05, "loss": 0.86114407, "num_input_tokens_seen": 361234432, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4353, "time_per_iteration": 2.6874396800994873 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048044, "balance_loss_mlp": 1.0389719, "diversity_loss_mlp": 0.0, "epoch": 0.8376298576375529, "flos": 788129699328.0, "grad_norm": 0.07663124345564373, "language_loss": 0.82635599, "learning_rate": 6.755958858777434e-05, "loss": 0.83683646, "num_input_tokens_seen": 361309376, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 4354, "time_per_iteration": 2.998286724090576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052737, "balance_loss_mlp": 1.04317021, "diversity_loss_mlp": 0.0, "epoch": 0.8378222393228165, "flos": 577613265408.0, "grad_norm": 0.07233016182516484, "language_loss": 0.80633909, "learning_rate": 6.74032853891452e-05, "loss": 0.8168664, "num_input_tokens_seen": 361386768, "router_z_loss_mlp": 0.09564209, "routerloss_mlp": 0.0, "step": 4355, "time_per_iteration": 2.75176739692688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046082, "balance_loss_mlp": 1.03711188, "diversity_loss_mlp": 0.0, "epoch": 0.83801462100808, "flos": 480865766400.0, "grad_norm": 0.06437396666642163, "language_loss": 0.82113147, "learning_rate": 6.724715013945548e-05, "loss": 0.83159232, "num_input_tokens_seen": 361456704, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4356, "time_per_iteration": 2.638768196105957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050411, "balance_loss_mlp": 1.04145241, "diversity_loss_mlp": 0.0, "epoch": 0.8382070026933436, "flos": 550817044992.0, "grad_norm": 0.06364273403340714, "language_loss": 0.8922165, "learning_rate": 6.709118289932226e-05, "loss": 0.90272063, "num_input_tokens_seen": 361533648, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4357, "time_per_iteration": 2.78487491607666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051515, "balance_loss_mlp": 1.04247308, "diversity_loss_mlp": 0.0, "epoch": 0.8383993843786072, "flos": 624968898048.0, "grad_norm": 0.08356541609520973, "language_loss": 0.82212794, "learning_rate": 6.693538372929725e-05, "loss": 0.83264303, "num_input_tokens_seen": 361614256, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4358, "time_per_iteration": 2.9017884731292725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00786956, "balance_loss_mlp": 1.32808125, "diversity_loss_mlp": 0.22438851, "epoch": 0.8385917660638708, "flos": 491169153024.0, "grad_norm": 0.03328062669176706, "language_loss": 0.86377019, "learning_rate": 6.677975268986719e-05, "loss": 0.87163973, "num_input_tokens_seen": 361679008, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01072117, "step": 4359, "time_per_iteration": 2.57958984375 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047883, "balance_loss_mlp": 1.0387392, "diversity_loss_mlp": 0.0, "epoch": 0.8387841477491342, "flos": 466900692480.0, "grad_norm": 0.07170710125962251, "language_loss": 0.87394094, "learning_rate": 6.662428984145336e-05, "loss": 0.8844198, "num_input_tokens_seen": 361747600, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 4360, "time_per_iteration": 2.5944197177886963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01016166, "balance_loss_mlp": 1.01177895, "diversity_loss_mlp": 0.0, "epoch": 0.8389765294343978, "flos": 1564188475392.0, "grad_norm": 0.01396369957588317, "language_loss": 0.71780187, "learning_rate": 6.646899524441175e-05, "loss": 0.72796351, "num_input_tokens_seen": 361983104, "router_z_loss_mlp": 0.04394531, "routerloss_mlp": 0.0, "step": 4361, "time_per_iteration": 5.049343109130859 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049104, "balance_loss_mlp": 1.04028869, "diversity_loss_mlp": 0.0, "epoch": 0.8391689111196614, "flos": 602160708096.0, "grad_norm": 0.0657328713955244, "language_loss": 0.82911998, "learning_rate": 6.631386895903308e-05, "loss": 0.83961105, "num_input_tokens_seen": 362065824, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4362, "time_per_iteration": 2.857707977294922 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049668, "balance_loss_mlp": 1.04045308, "diversity_loss_mlp": 0.0, "epoch": 0.839361292804925, "flos": 443047408128.0, "grad_norm": 0.07766308356740377, "language_loss": 0.80444038, "learning_rate": 6.615891104554261e-05, "loss": 0.81493711, "num_input_tokens_seen": 362128240, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4363, "time_per_iteration": 2.481901168823242 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046513, "balance_loss_mlp": 1.0369525, "diversity_loss_mlp": 0.0, "epoch": 0.8395536744901886, "flos": 594167768064.0, "grad_norm": 0.061496061316517255, "language_loss": 0.82737863, "learning_rate": 6.600412156410057e-05, "loss": 0.83784378, "num_input_tokens_seen": 362198256, "router_z_loss_mlp": 0.09552002, "routerloss_mlp": 0.0, "step": 4364, "time_per_iteration": 2.7074997425079346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048671, "balance_loss_mlp": 1.03946805, "diversity_loss_mlp": 0.0, "epoch": 0.8397460561754521, "flos": 889836171264.0, "grad_norm": 0.067014192244174, "language_loss": 0.84650993, "learning_rate": 6.58495005748016e-05, "loss": 0.85699666, "num_input_tokens_seen": 362279792, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 4365, "time_per_iteration": 3.1557445526123047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045295, "balance_loss_mlp": 1.03640795, "diversity_loss_mlp": 0.0, "epoch": 0.8399384378607156, "flos": 553503020544.0, "grad_norm": 0.0631575802857794, "language_loss": 0.89196813, "learning_rate": 6.569504813767463e-05, "loss": 0.90242112, "num_input_tokens_seen": 362351712, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4366, "time_per_iteration": 2.624469757080078 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046761, "balance_loss_mlp": 1.03753984, "diversity_loss_mlp": 0.0, "epoch": 0.8401308195459792, "flos": 518923832832.0, "grad_norm": 0.06347741472269025, "language_loss": 0.83584821, "learning_rate": 6.554076431268341e-05, "loss": 0.8463158, "num_input_tokens_seen": 362423424, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4367, "time_per_iteration": 2.6431565284729004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049148, "balance_loss_mlp": 1.04021287, "diversity_loss_mlp": 0.0, "epoch": 0.8403232012312428, "flos": 684933221376.0, "grad_norm": 0.07076442779164972, "language_loss": 0.80955088, "learning_rate": 6.538664915972648e-05, "loss": 0.82004237, "num_input_tokens_seen": 362514704, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4368, "time_per_iteration": 3.018554449081421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00773368, "balance_loss_mlp": 1.30118096, "diversity_loss_mlp": 0.22479768, "epoch": 0.8405155829165063, "flos": 577672736256.0, "grad_norm": 0.03439452063807504, "language_loss": 0.77776653, "learning_rate": 6.523270273863652e-05, "loss": 0.78550017, "num_input_tokens_seen": 362581296, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01037853, "step": 4369, "time_per_iteration": 2.6944448947906494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045809, "balance_loss_mlp": 1.03648067, "diversity_loss_mlp": 0.0, "epoch": 0.8407079646017699, "flos": 456627041280.0, "grad_norm": 0.1193689802326749, "language_loss": 0.87956655, "learning_rate": 6.507892510918079e-05, "loss": 0.8900246, "num_input_tokens_seen": 362648304, "router_z_loss_mlp": 0.09326172, "routerloss_mlp": 0.0, "step": 4370, "time_per_iteration": 2.529339551925659 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047708, "balance_loss_mlp": 1.03855264, "diversity_loss_mlp": 0.0, "epoch": 0.8409003462870335, "flos": 534917426688.0, "grad_norm": 0.07411757925982031, "language_loss": 0.81849647, "learning_rate": 6.492531633106114e-05, "loss": 0.82897353, "num_input_tokens_seen": 362721264, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4371, "time_per_iteration": 2.776374578475952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050613, "balance_loss_mlp": 1.04111791, "diversity_loss_mlp": 0.0, "epoch": 0.8410927279722971, "flos": 556759443456.0, "grad_norm": 0.08018635739985482, "language_loss": 0.77876925, "learning_rate": 6.477187646391374e-05, "loss": 0.78927541, "num_input_tokens_seen": 362795312, "router_z_loss_mlp": 0.09484863, "routerloss_mlp": 0.0, "step": 4372, "time_per_iteration": 2.7516069412231445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01008359, "balance_loss_mlp": 1.00390017, "diversity_loss_mlp": 0.0, "epoch": 0.8412851096575606, "flos": 1549754270208.0, "grad_norm": 0.00952058425700796, "language_loss": 0.77679121, "learning_rate": 6.461860556730925e-05, "loss": 0.78687477, "num_input_tokens_seen": 363026272, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4373, "time_per_iteration": 4.912792682647705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048789, "balance_loss_mlp": 1.0395565, "diversity_loss_mlp": 0.0, "epoch": 0.8414774913428241, "flos": 552042749952.0, "grad_norm": 0.07245552666854996, "language_loss": 0.78958535, "learning_rate": 6.446550370075271e-05, "loss": 0.80007321, "num_input_tokens_seen": 363098384, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4374, "time_per_iteration": 2.711447238922119 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046156, "balance_loss_mlp": 1.03688145, "diversity_loss_mlp": 0.0, "epoch": 0.8416698730280877, "flos": 573015140352.0, "grad_norm": 0.07770698856431457, "language_loss": 0.77577722, "learning_rate": 6.431257092368336e-05, "loss": 0.78623879, "num_input_tokens_seen": 363170960, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 4375, "time_per_iteration": 2.694774627685547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050017, "balance_loss_mlp": 1.04059398, "diversity_loss_mlp": 0.0, "epoch": 0.8418622547133513, "flos": 758731940352.0, "grad_norm": 0.11734230107546348, "language_loss": 0.80035317, "learning_rate": 6.415980729547543e-05, "loss": 0.81085336, "num_input_tokens_seen": 363242000, "router_z_loss_mlp": 0.09411621, "routerloss_mlp": 0.0, "step": 4376, "time_per_iteration": 2.918545961380005 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049017, "balance_loss_mlp": 1.03976655, "diversity_loss_mlp": 0.0, "epoch": 0.8420546363986149, "flos": 1074156940800.0, "grad_norm": 0.07794527811003633, "language_loss": 0.72769749, "learning_rate": 6.40072128754366e-05, "loss": 0.73818767, "num_input_tokens_seen": 363340288, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 4377, "time_per_iteration": 3.4151737689971924 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050973, "balance_loss_mlp": 1.04171598, "diversity_loss_mlp": 0.0, "epoch": 0.8422470180838784, "flos": 525908754432.0, "grad_norm": 0.0675536673804059, "language_loss": 0.82617545, "learning_rate": 6.385478772280933e-05, "loss": 0.83668518, "num_input_tokens_seen": 363416208, "router_z_loss_mlp": 0.0925293, "routerloss_mlp": 0.0, "step": 4378, "time_per_iteration": 2.749711036682129 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048915, "balance_loss_mlp": 1.03964031, "diversity_loss_mlp": 0.0, "epoch": 0.842439399769142, "flos": 600834060288.0, "grad_norm": 0.06567054296588401, "language_loss": 0.82044506, "learning_rate": 6.370253189677038e-05, "loss": 0.83093417, "num_input_tokens_seen": 363492864, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 4379, "time_per_iteration": 2.761420488357544 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049874, "balance_loss_mlp": 1.04072499, "diversity_loss_mlp": 0.0, "epoch": 0.8426317814544055, "flos": 552222987264.0, "grad_norm": 0.06119198131713492, "language_loss": 0.86507058, "learning_rate": 6.355044545643073e-05, "loss": 0.87556934, "num_input_tokens_seen": 363572000, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 4380, "time_per_iteration": 2.816401720046997 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049119, "balance_loss_mlp": 1.04015481, "diversity_loss_mlp": 0.0, "epoch": 0.8428241631396691, "flos": 678832980480.0, "grad_norm": 0.08611471083111012, "language_loss": 0.77840042, "learning_rate": 6.33985284608356e-05, "loss": 0.78889161, "num_input_tokens_seen": 363646480, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4381, "time_per_iteration": 2.8088033199310303 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048589, "balance_loss_mlp": 1.03958273, "diversity_loss_mlp": 0.0, "epoch": 0.8430165448249327, "flos": 753730748928.0, "grad_norm": 0.06180211012921075, "language_loss": 0.79696667, "learning_rate": 6.324678096896435e-05, "loss": 0.80745256, "num_input_tokens_seen": 363737552, "router_z_loss_mlp": 0.09014893, "routerloss_mlp": 0.0, "step": 4382, "time_per_iteration": 3.0762522220611572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049388, "balance_loss_mlp": 1.04026818, "diversity_loss_mlp": 0.0, "epoch": 0.8432089265101962, "flos": 699140574720.0, "grad_norm": 0.07097197774761282, "language_loss": 0.80925977, "learning_rate": 6.30952030397306e-05, "loss": 0.81975365, "num_input_tokens_seen": 363816016, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4383, "time_per_iteration": 2.8958194255828857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047141, "balance_loss_mlp": 1.03793836, "diversity_loss_mlp": 0.0, "epoch": 0.8434013081954598, "flos": 485767839744.0, "grad_norm": 0.08175099554660337, "language_loss": 0.84386265, "learning_rate": 6.294379473198208e-05, "loss": 0.854334, "num_input_tokens_seen": 363888192, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 4384, "time_per_iteration": 2.6954331398010254 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049059, "balance_loss_mlp": 1.03982067, "diversity_loss_mlp": 0.0, "epoch": 0.8435936898807234, "flos": 520623811584.0, "grad_norm": 0.0940310335311775, "language_loss": 0.85289472, "learning_rate": 6.279255610450068e-05, "loss": 0.86338532, "num_input_tokens_seen": 363953904, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 4385, "time_per_iteration": 2.6073288917541504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052192, "balance_loss_mlp": 1.0430907, "diversity_loss_mlp": 0.0, "epoch": 0.843786071565987, "flos": 785945534976.0, "grad_norm": 0.06584361059499325, "language_loss": 0.80478346, "learning_rate": 6.264148721600254e-05, "loss": 0.81530541, "num_input_tokens_seen": 364031552, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4386, "time_per_iteration": 2.9602465629577637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003223, "balance_loss_mlp": 0.99876487, "diversity_loss_mlp": 0.0, "epoch": 0.8439784532512504, "flos": 1446278436864.0, "grad_norm": 0.01332354164942413, "language_loss": 0.75836509, "learning_rate": 6.24905881251378e-05, "loss": 0.76839739, "num_input_tokens_seen": 364256480, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4387, "time_per_iteration": 4.922089099884033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051887, "balance_loss_mlp": 1.0426724, "diversity_loss_mlp": 0.0, "epoch": 0.844170834936514, "flos": 708700243968.0, "grad_norm": 0.08625525862164317, "language_loss": 0.82786238, "learning_rate": 6.23398588904906e-05, "loss": 0.83838129, "num_input_tokens_seen": 364329696, "router_z_loss_mlp": 0.09216309, "routerloss_mlp": 0.0, "step": 4388, "time_per_iteration": 2.8626224994659424 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049632, "balance_loss_mlp": 1.04066157, "diversity_loss_mlp": 0.0, "epoch": 0.8443632166217776, "flos": 483428030976.0, "grad_norm": 0.06592449787759593, "language_loss": 0.79633564, "learning_rate": 6.218929957057922e-05, "loss": 0.80683196, "num_input_tokens_seen": 364400944, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4389, "time_per_iteration": 2.681319236755371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053609, "balance_loss_mlp": 1.04455543, "diversity_loss_mlp": 0.0, "epoch": 0.8445555983070412, "flos": 678694588416.0, "grad_norm": 0.06375633990495472, "language_loss": 0.80234212, "learning_rate": 6.2038910223856e-05, "loss": 0.81287819, "num_input_tokens_seen": 364475744, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4390, "time_per_iteration": 2.8914427757263184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051123, "balance_loss_mlp": 1.0421586, "diversity_loss_mlp": 0.0, "epoch": 0.8447479799923048, "flos": 741485477376.0, "grad_norm": 0.07030854249904422, "language_loss": 0.74476206, "learning_rate": 6.18886909087073e-05, "loss": 0.75527334, "num_input_tokens_seen": 364557248, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4391, "time_per_iteration": 2.983142375946045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056789, "balance_loss_mlp": 1.04770541, "diversity_loss_mlp": 0.0, "epoch": 0.8449403616775683, "flos": 953306537472.0, "grad_norm": 0.06360843007002392, "language_loss": 0.80354917, "learning_rate": 6.173864168345344e-05, "loss": 0.81411707, "num_input_tokens_seen": 364647856, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 4392, "time_per_iteration": 3.266145706176758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105708, "balance_loss_mlp": 1.04769254, "diversity_loss_mlp": 0.0, "epoch": 0.8451327433628318, "flos": 657363921408.0, "grad_norm": 0.0822485878003235, "language_loss": 0.72267312, "learning_rate": 6.158876260634871e-05, "loss": 0.73324394, "num_input_tokens_seen": 364728848, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 4393, "time_per_iteration": 2.8685081005096436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104874, "balance_loss_mlp": 1.03982329, "diversity_loss_mlp": 0.0, "epoch": 0.8453251250480954, "flos": 446113681920.0, "grad_norm": 0.07697573681675166, "language_loss": 0.83679235, "learning_rate": 6.143905373558112e-05, "loss": 0.84727973, "num_input_tokens_seen": 364794032, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4394, "time_per_iteration": 2.533674478530884 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053449, "balance_loss_mlp": 1.04453218, "diversity_loss_mlp": 0.0, "epoch": 0.845517506733359, "flos": 542767205376.0, "grad_norm": 0.07537571823528784, "language_loss": 0.7097168, "learning_rate": 6.128951512927305e-05, "loss": 0.72025126, "num_input_tokens_seen": 364868624, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4395, "time_per_iteration": 2.6876683235168457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051353, "balance_loss_mlp": 1.04228103, "diversity_loss_mlp": 0.0, "epoch": 0.8457098884186226, "flos": 502440910848.0, "grad_norm": 0.08282627197829308, "language_loss": 0.84426546, "learning_rate": 6.114014684548046e-05, "loss": 0.85477906, "num_input_tokens_seen": 364938208, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4396, "time_per_iteration": 2.6650242805480957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050163, "balance_loss_mlp": 1.0413115, "diversity_loss_mlp": 0.0, "epoch": 0.8459022701038861, "flos": 448893259776.0, "grad_norm": 0.15468816830135243, "language_loss": 0.79700321, "learning_rate": 6.099094894219326e-05, "loss": 0.80750489, "num_input_tokens_seen": 365009440, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4397, "time_per_iteration": 2.7101781368255615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044589, "balance_loss_mlp": 1.03563631, "diversity_loss_mlp": 0.0, "epoch": 0.8460946517891497, "flos": 743178115584.0, "grad_norm": 0.05893126536703995, "language_loss": 0.75071192, "learning_rate": 6.0841921477335194e-05, "loss": 0.76115775, "num_input_tokens_seen": 365085904, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4398, "time_per_iteration": 2.9596059322357178 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104686, "balance_loss_mlp": 1.03793144, "diversity_loss_mlp": 0.0, "epoch": 0.8462870334744133, "flos": 553216324608.0, "grad_norm": 0.0659677770319019, "language_loss": 0.80090201, "learning_rate": 6.069306450876389e-05, "loss": 0.81137055, "num_input_tokens_seen": 365163600, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4399, "time_per_iteration": 2.750497341156006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01008801, "balance_loss_mlp": 1.0044378, "diversity_loss_mlp": 0.0, "epoch": 0.8464794151596768, "flos": 1564877864448.0, "grad_norm": 0.013995388355349315, "language_loss": 0.81708568, "learning_rate": 6.054437809427071e-05, "loss": 0.82717371, "num_input_tokens_seen": 365384528, "router_z_loss_mlp": 0.04370117, "routerloss_mlp": 0.0, "step": 4400, "time_per_iteration": 4.847966432571411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044991, "balance_loss_mlp": 1.03586566, "diversity_loss_mlp": 0.0, "epoch": 0.8466717968449403, "flos": 550197038592.0, "grad_norm": 0.060817981350280916, "language_loss": 0.79790008, "learning_rate": 6.039586229158084e-05, "loss": 0.80835003, "num_input_tokens_seen": 365453760, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4401, "time_per_iteration": 2.668105125427246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045073, "balance_loss_mlp": 1.03601933, "diversity_loss_mlp": 0.0, "epoch": 0.8468641785302039, "flos": 551919038976.0, "grad_norm": 0.07199778737497019, "language_loss": 0.84602404, "learning_rate": 6.024751715835314e-05, "loss": 0.85647476, "num_input_tokens_seen": 365532416, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4402, "time_per_iteration": 2.8081796169281006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044356, "balance_loss_mlp": 1.03515351, "diversity_loss_mlp": 0.0, "epoch": 0.8470565602154675, "flos": 572671544832.0, "grad_norm": 0.10925067279097164, "language_loss": 0.87193465, "learning_rate": 6.009934275218049e-05, "loss": 0.88237822, "num_input_tokens_seen": 365603776, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 4403, "time_per_iteration": 2.7070863246917725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047531, "balance_loss_mlp": 1.03842974, "diversity_loss_mlp": 0.0, "epoch": 0.8472489419007311, "flos": 472833179136.0, "grad_norm": 0.08568709869316025, "language_loss": 0.84353817, "learning_rate": 5.995133913058936e-05, "loss": 0.85401344, "num_input_tokens_seen": 365670432, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4404, "time_per_iteration": 2.5401875972747803 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044764, "balance_loss_mlp": 1.03592503, "diversity_loss_mlp": 0.0, "epoch": 0.8474413235859947, "flos": 798020481024.0, "grad_norm": 0.0709686000036253, "language_loss": 0.79758859, "learning_rate": 5.980350635103954e-05, "loss": 0.80803621, "num_input_tokens_seen": 365741584, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4405, "time_per_iteration": 2.9586398601531982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047629, "balance_loss_mlp": 1.03862858, "diversity_loss_mlp": 0.0, "epoch": 0.8476337052712581, "flos": 502379241984.0, "grad_norm": 0.0758173793957083, "language_loss": 0.80622578, "learning_rate": 5.9655844470924866e-05, "loss": 0.81670201, "num_input_tokens_seen": 365805344, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4406, "time_per_iteration": 2.5468907356262207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104799, "balance_loss_mlp": 1.03891182, "diversity_loss_mlp": 0.0, "epoch": 0.8478260869565217, "flos": 931971101184.0, "grad_norm": 0.08716014432574012, "language_loss": 0.83022702, "learning_rate": 5.9508353547573e-05, "loss": 0.84070694, "num_input_tokens_seen": 365890976, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4407, "time_per_iteration": 3.180832862854004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046222, "balance_loss_mlp": 1.03713799, "diversity_loss_mlp": 0.0, "epoch": 0.8480184686417853, "flos": 708811471872.0, "grad_norm": 0.06912642288251827, "language_loss": 0.80724686, "learning_rate": 5.9361033638244855e-05, "loss": 0.81770915, "num_input_tokens_seen": 365968912, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4408, "time_per_iteration": 2.8790152072906494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045571, "balance_loss_mlp": 1.03665996, "diversity_loss_mlp": 0.0, "epoch": 0.8482108503270489, "flos": 614440857600.0, "grad_norm": 0.06430935054215667, "language_loss": 0.82201052, "learning_rate": 5.9213884800135066e-05, "loss": 0.83246624, "num_input_tokens_seen": 366047680, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4409, "time_per_iteration": 2.8187878131866455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048254, "balance_loss_mlp": 1.03908658, "diversity_loss_mlp": 0.0, "epoch": 0.8484032320123124, "flos": 531016031232.0, "grad_norm": 0.07260617685747814, "language_loss": 0.82220393, "learning_rate": 5.906690709037194e-05, "loss": 0.83268642, "num_input_tokens_seen": 366118720, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4410, "time_per_iteration": 2.618715286254883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01011478, "balance_loss_mlp": 1.00716281, "diversity_loss_mlp": 0.0, "epoch": 0.848595613697576, "flos": 1542776315904.0, "grad_norm": 0.010800011769390029, "language_loss": 0.76296914, "learning_rate": 5.892010056601726e-05, "loss": 0.77308393, "num_input_tokens_seen": 366346928, "router_z_loss_mlp": 0.04321289, "routerloss_mlp": 0.0, "step": 4411, "time_per_iteration": 4.929163455963135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00779672, "balance_loss_mlp": 1.31286287, "diversity_loss_mlp": 0.22471759, "epoch": 0.8487879953828396, "flos": 677342974464.0, "grad_norm": 0.03344280518316992, "language_loss": 0.74134266, "learning_rate": 5.877346528406635e-05, "loss": 0.74913931, "num_input_tokens_seen": 366422848, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01088216, "step": 4412, "time_per_iteration": 2.887648582458496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046513, "balance_loss_mlp": 1.03763819, "diversity_loss_mlp": 0.0, "epoch": 0.8489803770681031, "flos": 503673956352.0, "grad_norm": 0.07759361608874747, "language_loss": 0.79911488, "learning_rate": 5.8627001301448105e-05, "loss": 0.80958003, "num_input_tokens_seen": 366492016, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4413, "time_per_iteration": 2.634019613265991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051803, "balance_loss_mlp": 1.04298139, "diversity_loss_mlp": 0.0, "epoch": 0.8491727587533667, "flos": 563186027520.0, "grad_norm": 0.06257116408066361, "language_loss": 0.77061796, "learning_rate": 5.84807086750247e-05, "loss": 0.78113604, "num_input_tokens_seen": 366566400, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4414, "time_per_iteration": 2.739079236984253 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045842, "balance_loss_mlp": 1.03654408, "diversity_loss_mlp": 0.0, "epoch": 0.8493651404386302, "flos": 459784719360.0, "grad_norm": 0.08252582476840821, "language_loss": 0.779769, "learning_rate": 5.833458746159243e-05, "loss": 0.79022747, "num_input_tokens_seen": 366634016, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 4415, "time_per_iteration": 2.550938367843628 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790044, "balance_loss_mlp": 1.33385825, "diversity_loss_mlp": 0.22484043, "epoch": 0.8495575221238938, "flos": 461170838016.0, "grad_norm": 0.03510190626754167, "language_loss": 0.82241035, "learning_rate": 5.818863771788013e-05, "loss": 0.83031082, "num_input_tokens_seen": 366704384, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01069522, "step": 4416, "time_per_iteration": 2.629504442214966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052312, "balance_loss_mlp": 1.04326987, "diversity_loss_mlp": 0.0, "epoch": 0.8497499038091574, "flos": 870712063488.0, "grad_norm": 0.06455923563838298, "language_loss": 0.81343329, "learning_rate": 5.8042859500550604e-05, "loss": 0.82395649, "num_input_tokens_seen": 366785456, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4417, "time_per_iteration": 3.1615569591522217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00774549, "balance_loss_mlp": 1.30053818, "diversity_loss_mlp": 0.22707056, "epoch": 0.849942285494421, "flos": 779600443392.0, "grad_norm": 0.03325715859037055, "language_loss": 0.78278667, "learning_rate": 5.789725286620018e-05, "loss": 0.79053217, "num_input_tokens_seen": 366862848, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01074457, "step": 4418, "time_per_iteration": 3.063164234161377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105333, "balance_loss_mlp": 1.04439521, "diversity_loss_mlp": 0.0, "epoch": 0.8501346671796844, "flos": 513816556032.0, "grad_norm": 0.06460470640159872, "language_loss": 0.84812874, "learning_rate": 5.775181787135819e-05, "loss": 0.85866207, "num_input_tokens_seen": 366934800, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4419, "time_per_iteration": 2.694917678833008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052007, "balance_loss_mlp": 1.043239, "diversity_loss_mlp": 0.0, "epoch": 0.850327048864948, "flos": 621445602816.0, "grad_norm": 0.11539940060888441, "language_loss": 0.83957243, "learning_rate": 5.76065545724877e-05, "loss": 0.85009253, "num_input_tokens_seen": 367015152, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4420, "time_per_iteration": 2.8541665077209473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053783, "balance_loss_mlp": 1.04484272, "diversity_loss_mlp": 0.0, "epoch": 0.8505194305502116, "flos": 774221524992.0, "grad_norm": 0.06628978561515504, "language_loss": 0.79903436, "learning_rate": 5.746146302598454e-05, "loss": 0.80957222, "num_input_tokens_seen": 367092192, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4421, "time_per_iteration": 3.027402877807617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057337, "balance_loss_mlp": 1.04840255, "diversity_loss_mlp": 0.0, "epoch": 0.8507118122354752, "flos": 465257613312.0, "grad_norm": 0.065145609650453, "language_loss": 0.86839747, "learning_rate": 5.731654328817859e-05, "loss": 0.87897086, "num_input_tokens_seen": 367159744, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4422, "time_per_iteration": 2.608247756958008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01060117, "balance_loss_mlp": 1.05109882, "diversity_loss_mlp": 0.0, "epoch": 0.8509041939207388, "flos": 534413417472.0, "grad_norm": 0.06673581896538218, "language_loss": 0.84873575, "learning_rate": 5.717179541533257e-05, "loss": 0.85933691, "num_input_tokens_seen": 367226384, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4423, "time_per_iteration": 2.640604019165039 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055598, "balance_loss_mlp": 1.04669881, "diversity_loss_mlp": 0.0, "epoch": 0.8510965756060023, "flos": 583738472448.0, "grad_norm": 0.07136007632395135, "language_loss": 0.84349924, "learning_rate": 5.702721946364264e-05, "loss": 0.85405523, "num_input_tokens_seen": 367294768, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4424, "time_per_iteration": 2.681556463241577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056087, "balance_loss_mlp": 1.04699087, "diversity_loss_mlp": 0.0, "epoch": 0.8512889572912659, "flos": 600841400832.0, "grad_norm": 0.09439640399937352, "language_loss": 0.77805614, "learning_rate": 5.688281548923796e-05, "loss": 0.78861696, "num_input_tokens_seen": 367372368, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4425, "time_per_iteration": 2.7769734859466553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105464, "balance_loss_mlp": 1.04534197, "diversity_loss_mlp": 0.0, "epoch": 0.8514813389765294, "flos": 654791745024.0, "grad_norm": 0.06728138208507028, "language_loss": 0.78342903, "learning_rate": 5.673858354818151e-05, "loss": 0.79397547, "num_input_tokens_seen": 367452656, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 4426, "time_per_iteration": 2.878251075744629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052363, "balance_loss_mlp": 1.04355907, "diversity_loss_mlp": 0.0, "epoch": 0.851673720661793, "flos": 429761811456.0, "grad_norm": 0.08229476351335695, "language_loss": 0.78530198, "learning_rate": 5.6594523696468726e-05, "loss": 0.7958256, "num_input_tokens_seen": 367517808, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4427, "time_per_iteration": 2.51084041595459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105621, "balance_loss_mlp": 1.04718578, "diversity_loss_mlp": 0.0, "epoch": 0.8518661023470565, "flos": 641572959744.0, "grad_norm": 0.06960729962592987, "language_loss": 0.79901236, "learning_rate": 5.645063599002875e-05, "loss": 0.80957448, "num_input_tokens_seen": 367591728, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4428, "time_per_iteration": 2.7762057781219482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055828, "balance_loss_mlp": 1.04680383, "diversity_loss_mlp": 0.0, "epoch": 0.8520584840323201, "flos": 562143504384.0, "grad_norm": 0.07302244449525275, "language_loss": 0.79662502, "learning_rate": 5.630692048472363e-05, "loss": 0.80718338, "num_input_tokens_seen": 367664496, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4429, "time_per_iteration": 2.660036325454712 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056924, "balance_loss_mlp": 1.04789412, "diversity_loss_mlp": 0.0, "epoch": 0.8522508657175837, "flos": 527050395648.0, "grad_norm": 0.07546735542766958, "language_loss": 0.78632665, "learning_rate": 5.61633772363489e-05, "loss": 0.79689586, "num_input_tokens_seen": 367735584, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4430, "time_per_iteration": 2.6127545833587646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105219, "balance_loss_mlp": 1.04328537, "diversity_loss_mlp": 0.0, "epoch": 0.8524432474028473, "flos": 499120247808.0, "grad_norm": 0.06572867134879866, "language_loss": 0.80567098, "learning_rate": 5.602000630063298e-05, "loss": 0.81619287, "num_input_tokens_seen": 367801136, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4431, "time_per_iteration": 2.5721845626831055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053312, "balance_loss_mlp": 1.04428816, "diversity_loss_mlp": 0.0, "epoch": 0.8526356290881109, "flos": 421314048000.0, "grad_norm": 0.07674502364366044, "language_loss": 0.79846716, "learning_rate": 5.587680773323706e-05, "loss": 0.80900025, "num_input_tokens_seen": 367865312, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 4432, "time_per_iteration": 2.510967493057251 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01057356, "balance_loss_mlp": 1.04839182, "diversity_loss_mlp": 0.0, "epoch": 0.8528280107733743, "flos": 507328303104.0, "grad_norm": 0.0698638093203012, "language_loss": 0.80873108, "learning_rate": 5.5733781589756115e-05, "loss": 0.8193047, "num_input_tokens_seen": 367931104, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4433, "time_per_iteration": 2.6090145111083984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054798, "balance_loss_mlp": 1.04608333, "diversity_loss_mlp": 0.0, "epoch": 0.8530203924586379, "flos": 445893797376.0, "grad_norm": 0.06627585566585331, "language_loss": 0.82683206, "learning_rate": 5.5590927925717684e-05, "loss": 0.83738005, "num_input_tokens_seen": 367995520, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4434, "time_per_iteration": 2.5510103702545166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055005, "balance_loss_mlp": 1.04617763, "diversity_loss_mlp": 0.0, "epoch": 0.8532127741439015, "flos": 657759273984.0, "grad_norm": 0.06848630308035882, "language_loss": 0.83932847, "learning_rate": 5.54482467965825e-05, "loss": 0.84987855, "num_input_tokens_seen": 368073664, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4435, "time_per_iteration": 2.9127962589263916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052958, "balance_loss_mlp": 1.0440768, "diversity_loss_mlp": 0.0, "epoch": 0.8534051558291651, "flos": 536019420672.0, "grad_norm": 0.07760386997403859, "language_loss": 0.83284372, "learning_rate": 5.5305738257744264e-05, "loss": 0.8433733, "num_input_tokens_seen": 368147536, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4436, "time_per_iteration": 2.7183430194854736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01056242, "balance_loss_mlp": 1.04731894, "diversity_loss_mlp": 0.0, "epoch": 0.8535975375144286, "flos": 533000134656.0, "grad_norm": 0.08897067825861743, "language_loss": 0.79124266, "learning_rate": 5.5163402364529655e-05, "loss": 0.80180502, "num_input_tokens_seen": 368218672, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4437, "time_per_iteration": 2.6436634063720703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051335, "balance_loss_mlp": 1.04229927, "diversity_loss_mlp": 0.0, "epoch": 0.8537899191996922, "flos": 574141727232.0, "grad_norm": 0.07034775984994458, "language_loss": 0.82836092, "learning_rate": 5.502123917219848e-05, "loss": 0.83887428, "num_input_tokens_seen": 368287056, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4438, "time_per_iteration": 2.71964430809021 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105491, "balance_loss_mlp": 1.04575455, "diversity_loss_mlp": 0.0, "epoch": 0.8539823008849557, "flos": 465007993344.0, "grad_norm": 0.0746305826676403, "language_loss": 0.83321023, "learning_rate": 5.48792487359433e-05, "loss": 0.8437593, "num_input_tokens_seen": 368358400, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 4439, "time_per_iteration": 2.7270102500915527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105129, "balance_loss_mlp": 1.04193783, "diversity_loss_mlp": 0.0, "epoch": 0.8541746825702193, "flos": 554713671168.0, "grad_norm": 0.11714515413286376, "language_loss": 0.81816977, "learning_rate": 5.4737431110889745e-05, "loss": 0.82868266, "num_input_tokens_seen": 368427168, "router_z_loss_mlp": 0.09350586, "routerloss_mlp": 0.0, "step": 4440, "time_per_iteration": 2.665386915206909 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047429, "balance_loss_mlp": 1.03834486, "diversity_loss_mlp": 0.0, "epoch": 0.8543670642554829, "flos": 546391816704.0, "grad_norm": 0.06595291509459175, "language_loss": 0.77334499, "learning_rate": 5.4595786352096165e-05, "loss": 0.78381932, "num_input_tokens_seen": 368503584, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4441, "time_per_iteration": 2.7599966526031494 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049537, "balance_loss_mlp": 1.04063272, "diversity_loss_mlp": 0.0, "epoch": 0.8545594459407464, "flos": 512027744256.0, "grad_norm": 0.07060933653649062, "language_loss": 0.82500267, "learning_rate": 5.4454314514554236e-05, "loss": 0.83549809, "num_input_tokens_seen": 368576976, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4442, "time_per_iteration": 2.639495372772217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051049, "balance_loss_mlp": 1.04200077, "diversity_loss_mlp": 0.0, "epoch": 0.85475182762601, "flos": 421185567744.0, "grad_norm": 0.07063393477475531, "language_loss": 0.81464767, "learning_rate": 5.431301565318786e-05, "loss": 0.82515812, "num_input_tokens_seen": 368641664, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4443, "time_per_iteration": 2.4978034496307373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048968, "balance_loss_mlp": 1.03971708, "diversity_loss_mlp": 0.0, "epoch": 0.8549442093112736, "flos": 389435516928.0, "grad_norm": 0.08111118700719577, "language_loss": 0.77217865, "learning_rate": 5.41718898228542e-05, "loss": 0.78266835, "num_input_tokens_seen": 368705616, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4444, "time_per_iteration": 2.4748144149780273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050876, "balance_loss_mlp": 1.04197693, "diversity_loss_mlp": 0.0, "epoch": 0.8551365909965372, "flos": 605926282752.0, "grad_norm": 0.09368313437946132, "language_loss": 0.79476607, "learning_rate": 5.403093707834334e-05, "loss": 0.80527484, "num_input_tokens_seen": 368779664, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4445, "time_per_iteration": 2.796154499053955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049626, "balance_loss_mlp": 1.04050708, "diversity_loss_mlp": 0.0, "epoch": 0.8553289726818007, "flos": 504160713216.0, "grad_norm": 0.06371937907069437, "language_loss": 0.78714025, "learning_rate": 5.3890157474377865e-05, "loss": 0.79763651, "num_input_tokens_seen": 368846656, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4446, "time_per_iteration": 2.5761666297912598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051208, "balance_loss_mlp": 1.04208875, "diversity_loss_mlp": 0.0, "epoch": 0.8555213543670642, "flos": 557009063424.0, "grad_norm": 0.06774235964888489, "language_loss": 0.76389277, "learning_rate": 5.374955106561324e-05, "loss": 0.77440482, "num_input_tokens_seen": 368923712, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 4447, "time_per_iteration": 2.772761344909668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050103, "balance_loss_mlp": 1.04116249, "diversity_loss_mlp": 0.0, "epoch": 0.8557137360523278, "flos": 548104278528.0, "grad_norm": 0.06327552262806617, "language_loss": 0.75251746, "learning_rate": 5.360911790663775e-05, "loss": 0.76301849, "num_input_tokens_seen": 368994496, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4448, "time_per_iteration": 2.6334402561187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047773, "balance_loss_mlp": 1.03859377, "diversity_loss_mlp": 0.0, "epoch": 0.8559061177375914, "flos": 728182628352.0, "grad_norm": 0.057928896872347986, "language_loss": 0.78575248, "learning_rate": 5.346885805197238e-05, "loss": 0.7962302, "num_input_tokens_seen": 369077088, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4449, "time_per_iteration": 2.965585947036743 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105067, "balance_loss_mlp": 1.0418725, "diversity_loss_mlp": 0.0, "epoch": 0.856098499422855, "flos": 535881028608.0, "grad_norm": 0.07751296058129717, "language_loss": 0.83346003, "learning_rate": 5.332877155607085e-05, "loss": 0.84396672, "num_input_tokens_seen": 369147680, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4450, "time_per_iteration": 2.6572906970977783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051985, "balance_loss_mlp": 1.04291868, "diversity_loss_mlp": 0.0, "epoch": 0.8562908811081185, "flos": 573664882176.0, "grad_norm": 0.06226038691697754, "language_loss": 0.83402085, "learning_rate": 5.3188858473319504e-05, "loss": 0.84454072, "num_input_tokens_seen": 369224320, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4451, "time_per_iteration": 2.715268611907959 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050652, "balance_loss_mlp": 1.04167557, "diversity_loss_mlp": 0.0, "epoch": 0.856483262793382, "flos": 781754872320.0, "grad_norm": 0.07567123638772062, "language_loss": 0.80818313, "learning_rate": 5.3049118858037426e-05, "loss": 0.8186897, "num_input_tokens_seen": 369315744, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4452, "time_per_iteration": 3.072892665863037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104824, "balance_loss_mlp": 1.03925145, "diversity_loss_mlp": 0.0, "epoch": 0.8566756444786456, "flos": 455819083776.0, "grad_norm": 0.0664830695636331, "language_loss": 0.84927678, "learning_rate": 5.290955276447651e-05, "loss": 0.85975915, "num_input_tokens_seen": 369382800, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4453, "time_per_iteration": 2.538435697555542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048709, "balance_loss_mlp": 1.03954768, "diversity_loss_mlp": 0.0, "epoch": 0.8568680261639092, "flos": 449382587904.0, "grad_norm": 0.08569801456429596, "language_loss": 0.84562624, "learning_rate": 5.277016024682091e-05, "loss": 0.85611331, "num_input_tokens_seen": 369447312, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4454, "time_per_iteration": 2.510293960571289 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045918, "balance_loss_mlp": 1.03693008, "diversity_loss_mlp": 0.0, "epoch": 0.8570604078491728, "flos": 479976316416.0, "grad_norm": 0.07456272936898871, "language_loss": 0.82575965, "learning_rate": 5.2630941359187665e-05, "loss": 0.83621883, "num_input_tokens_seen": 369512800, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4455, "time_per_iteration": 2.5304741859436035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783782, "balance_loss_mlp": 1.32045674, "diversity_loss_mlp": 0.22576013, "epoch": 0.8572527895344363, "flos": 505942184448.0, "grad_norm": 0.031240053389996185, "language_loss": 0.85362232, "learning_rate": 5.249189615562627e-05, "loss": 0.86146021, "num_input_tokens_seen": 369580720, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01067326, "step": 4456, "time_per_iteration": 2.6050779819488525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047609, "balance_loss_mlp": 1.03857875, "diversity_loss_mlp": 0.0, "epoch": 0.8574451712196999, "flos": 787044957696.0, "grad_norm": 0.05524865057671199, "language_loss": 0.83069348, "learning_rate": 5.235302469011905e-05, "loss": 0.84116954, "num_input_tokens_seen": 369672544, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4457, "time_per_iteration": 3.0707337856292725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046976, "balance_loss_mlp": 1.03807688, "diversity_loss_mlp": 0.0, "epoch": 0.8576375529049635, "flos": 509252935680.0, "grad_norm": 0.061549314191434064, "language_loss": 0.75128138, "learning_rate": 5.2214327016580575e-05, "loss": 0.76175112, "num_input_tokens_seen": 369745776, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4458, "time_per_iteration": 2.8048369884490967 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01009207, "balance_loss_mlp": 1.00486779, "diversity_loss_mlp": 0.0, "epoch": 0.857829934590227, "flos": 1460772486144.0, "grad_norm": 0.009410723197847748, "language_loss": 0.84767288, "learning_rate": 5.207580318885802e-05, "loss": 0.85776496, "num_input_tokens_seen": 369975200, "router_z_loss_mlp": 0.04345703, "routerloss_mlp": 0.0, "step": 4459, "time_per_iteration": 5.052462339401245 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049117, "balance_loss_mlp": 1.04002094, "diversity_loss_mlp": 0.0, "epoch": 0.8580223162754905, "flos": 479296839168.0, "grad_norm": 0.05814228288805263, "language_loss": 0.89274621, "learning_rate": 5.193745326073118e-05, "loss": 0.90323746, "num_input_tokens_seen": 370043296, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 4460, "time_per_iteration": 2.707102060317993 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048266, "balance_loss_mlp": 1.03917027, "diversity_loss_mlp": 0.0, "epoch": 0.8582146979607541, "flos": 706231954944.0, "grad_norm": 0.07378533003990426, "language_loss": 0.7931006, "learning_rate": 5.179927728591227e-05, "loss": 0.80358326, "num_input_tokens_seen": 370111152, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 4461, "time_per_iteration": 2.865081310272217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104748, "balance_loss_mlp": 1.03854513, "diversity_loss_mlp": 0.0, "epoch": 0.8584070796460177, "flos": 765158524416.0, "grad_norm": 0.06549370953575787, "language_loss": 0.823946, "learning_rate": 5.1661275318045874e-05, "loss": 0.8344208, "num_input_tokens_seen": 370190272, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4462, "time_per_iteration": 2.960702419281006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051966, "balance_loss_mlp": 1.04283428, "diversity_loss_mlp": 0.0, "epoch": 0.8585994613312813, "flos": 586829339136.0, "grad_norm": 0.07292053022403922, "language_loss": 0.85890585, "learning_rate": 5.152344741070919e-05, "loss": 0.86942554, "num_input_tokens_seen": 370267056, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 4463, "time_per_iteration": 2.795929193496704 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047374, "balance_loss_mlp": 1.03847504, "diversity_loss_mlp": 0.0, "epoch": 0.8587918430165449, "flos": 608295826944.0, "grad_norm": 0.0593280148984403, "language_loss": 0.78598225, "learning_rate": 5.138579361741169e-05, "loss": 0.79645598, "num_input_tokens_seen": 370344176, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4464, "time_per_iteration": 2.81134033203125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046787, "balance_loss_mlp": 1.03755462, "diversity_loss_mlp": 0.0, "epoch": 0.8589842247018084, "flos": 588981570048.0, "grad_norm": 0.08434589868858423, "language_loss": 0.80900252, "learning_rate": 5.124831399159535e-05, "loss": 0.81947035, "num_input_tokens_seen": 370414224, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4465, "time_per_iteration": 2.698519229888916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055861, "balance_loss_mlp": 1.04674125, "diversity_loss_mlp": 0.0, "epoch": 0.8591766063870719, "flos": 543879111168.0, "grad_norm": 0.08280689414498507, "language_loss": 0.78631306, "learning_rate": 5.1111008586634475e-05, "loss": 0.79687166, "num_input_tokens_seen": 370484736, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4466, "time_per_iteration": 2.7119884490966797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051995, "balance_loss_mlp": 1.04303014, "diversity_loss_mlp": 0.0, "epoch": 0.8593689880723355, "flos": 493756010496.0, "grad_norm": 0.0696773734857941, "language_loss": 0.80894464, "learning_rate": 5.0973877455835816e-05, "loss": 0.81946456, "num_input_tokens_seen": 370556512, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4467, "time_per_iteration": 2.647484064102173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053716, "balance_loss_mlp": 1.04451299, "diversity_loss_mlp": 0.0, "epoch": 0.8595613697575991, "flos": 533909408256.0, "grad_norm": 0.07756425408438049, "language_loss": 0.83735067, "learning_rate": 5.083692065243822e-05, "loss": 0.84788781, "num_input_tokens_seen": 370622880, "router_z_loss_mlp": 0.09197998, "routerloss_mlp": 0.0, "step": 4468, "time_per_iteration": 2.606961488723755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050814, "balance_loss_mlp": 1.04189694, "diversity_loss_mlp": 0.0, "epoch": 0.8597537514428626, "flos": 617628271104.0, "grad_norm": 0.09275491108708087, "language_loss": 0.76113212, "learning_rate": 5.070013822961328e-05, "loss": 0.77164024, "num_input_tokens_seen": 370691632, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4469, "time_per_iteration": 2.7252352237701416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044872, "balance_loss_mlp": 1.03569305, "diversity_loss_mlp": 0.0, "epoch": 0.8599461331281262, "flos": 608730826752.0, "grad_norm": 0.0715850887288851, "language_loss": 0.84056306, "learning_rate": 5.056353024046462e-05, "loss": 0.85101181, "num_input_tokens_seen": 370764848, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4470, "time_per_iteration": 2.705986261367798 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105045, "balance_loss_mlp": 1.04136574, "diversity_loss_mlp": 0.0, "epoch": 0.8601385148133898, "flos": 551252044800.0, "grad_norm": 0.06285887675624062, "language_loss": 0.83157659, "learning_rate": 5.042709673802786e-05, "loss": 0.84208107, "num_input_tokens_seen": 370832496, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4471, "time_per_iteration": 2.666837215423584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049444, "balance_loss_mlp": 1.04027641, "diversity_loss_mlp": 0.0, "epoch": 0.8603308964986534, "flos": 581200800768.0, "grad_norm": 0.05893825733891097, "language_loss": 0.81146169, "learning_rate": 5.0290837775271494e-05, "loss": 0.8219561, "num_input_tokens_seen": 370917104, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4472, "time_per_iteration": 2.8742566108703613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048905, "balance_loss_mlp": 1.03975582, "diversity_loss_mlp": 0.0, "epoch": 0.8605232781839169, "flos": 629013828096.0, "grad_norm": 0.0784559569656679, "language_loss": 0.75468278, "learning_rate": 5.0154753405095846e-05, "loss": 0.76517183, "num_input_tokens_seen": 370984512, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 4473, "time_per_iteration": 2.7587168216705322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049665, "balance_loss_mlp": 1.04089117, "diversity_loss_mlp": 0.0, "epoch": 0.8607156598691804, "flos": 468141078528.0, "grad_norm": 0.06949986804746215, "language_loss": 0.77037829, "learning_rate": 5.0018843680333604e-05, "loss": 0.78087491, "num_input_tokens_seen": 371049664, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4474, "time_per_iteration": 2.6033754348754883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046102, "balance_loss_mlp": 1.0372982, "diversity_loss_mlp": 0.0, "epoch": 0.860908041554444, "flos": 488394344448.0, "grad_norm": 0.06715849698858382, "language_loss": 0.82796544, "learning_rate": 4.988310865374945e-05, "loss": 0.83842647, "num_input_tokens_seen": 371120704, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4475, "time_per_iteration": 2.6462340354919434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045142, "balance_loss_mlp": 1.03617787, "diversity_loss_mlp": 0.0, "epoch": 0.8611004232397076, "flos": 592094831616.0, "grad_norm": 0.15717168716327404, "language_loss": 0.80459589, "learning_rate": 4.974754837804057e-05, "loss": 0.81504726, "num_input_tokens_seen": 371189376, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4476, "time_per_iteration": 2.6762094497680664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049439, "balance_loss_mlp": 1.03996754, "diversity_loss_mlp": 0.0, "epoch": 0.8612928049249712, "flos": 774209041920.0, "grad_norm": 0.06321855833863838, "language_loss": 0.86383665, "learning_rate": 4.9612162905836036e-05, "loss": 0.874331, "num_input_tokens_seen": 371275184, "router_z_loss_mlp": 0.09472656, "routerloss_mlp": 0.0, "step": 4477, "time_per_iteration": 3.0531985759735107 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104971, "balance_loss_mlp": 1.04053116, "diversity_loss_mlp": 0.0, "epoch": 0.8614851866102347, "flos": 537553843200.0, "grad_norm": 0.06893935293866559, "language_loss": 0.82464266, "learning_rate": 4.947695228969718e-05, "loss": 0.83513981, "num_input_tokens_seen": 371347920, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4478, "time_per_iteration": 2.6873598098754883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104944, "balance_loss_mlp": 1.04062486, "diversity_loss_mlp": 0.0, "epoch": 0.8616775682954982, "flos": 565916419584.0, "grad_norm": 0.0676917705812813, "language_loss": 0.7915647, "learning_rate": 4.934191658211729e-05, "loss": 0.80205905, "num_input_tokens_seen": 371419728, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4479, "time_per_iteration": 2.6640400886535645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049843, "balance_loss_mlp": 1.04052103, "diversity_loss_mlp": 0.0, "epoch": 0.8618699499807618, "flos": 481592231424.0, "grad_norm": 0.06998246415259375, "language_loss": 0.81843102, "learning_rate": 4.92070558355221e-05, "loss": 0.82892942, "num_input_tokens_seen": 371488768, "router_z_loss_mlp": 0.09320068, "routerloss_mlp": 0.0, "step": 4480, "time_per_iteration": 2.6465768814086914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044397, "balance_loss_mlp": 1.0348897, "diversity_loss_mlp": 0.0, "epoch": 0.8620623316660254, "flos": 649506802176.0, "grad_norm": 0.09745126200827099, "language_loss": 0.74436772, "learning_rate": 4.9072370102269226e-05, "loss": 0.7548117, "num_input_tokens_seen": 371560144, "router_z_loss_mlp": 0.09490967, "routerloss_mlp": 0.0, "step": 4481, "time_per_iteration": 2.7863497734069824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048433, "balance_loss_mlp": 1.03935492, "diversity_loss_mlp": 0.0, "epoch": 0.862254713351289, "flos": 751781523456.0, "grad_norm": 0.06946555375175803, "language_loss": 0.85534787, "learning_rate": 4.893785943464801e-05, "loss": 0.86583215, "num_input_tokens_seen": 371635920, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 4482, "time_per_iteration": 2.9774255752563477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051438, "balance_loss_mlp": 1.04190743, "diversity_loss_mlp": 0.0, "epoch": 0.8624470950365525, "flos": 841543727616.0, "grad_norm": 0.07498520167107697, "language_loss": 0.77633011, "learning_rate": 4.880352388488024e-05, "loss": 0.78684449, "num_input_tokens_seen": 371727664, "router_z_loss_mlp": 0.09527588, "routerloss_mlp": 0.0, "step": 4483, "time_per_iteration": 3.2647812366485596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783832, "balance_loss_mlp": 1.32083893, "diversity_loss_mlp": 0.22531055, "epoch": 0.8626394767218161, "flos": 754793468928.0, "grad_norm": 0.03436935240738205, "language_loss": 0.83586842, "learning_rate": 4.866936350511969e-05, "loss": 0.84370679, "num_input_tokens_seen": 371800832, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01075701, "step": 4484, "time_per_iteration": 2.928110122680664 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048003, "balance_loss_mlp": 1.03885961, "diversity_loss_mlp": 0.0, "epoch": 0.8628318584070797, "flos": 703585626624.0, "grad_norm": 0.0696769189264069, "language_loss": 0.82539618, "learning_rate": 4.853537834745203e-05, "loss": 0.83587623, "num_input_tokens_seen": 371871472, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 4485, "time_per_iteration": 2.8806722164154053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048081, "balance_loss_mlp": 1.0388062, "diversity_loss_mlp": 0.0, "epoch": 0.8630242400923432, "flos": 471244428288.0, "grad_norm": 0.07034386086507984, "language_loss": 0.77557874, "learning_rate": 4.840156846389487e-05, "loss": 0.7860595, "num_input_tokens_seen": 371936512, "router_z_loss_mlp": 0.09277344, "routerloss_mlp": 0.0, "step": 4486, "time_per_iteration": 2.5718491077423096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045399, "balance_loss_mlp": 1.03601718, "diversity_loss_mlp": 0.0, "epoch": 0.8632166217776067, "flos": 964363553280.0, "grad_norm": 0.08075284630280707, "language_loss": 0.77191448, "learning_rate": 4.826793390639783e-05, "loss": 0.78236842, "num_input_tokens_seen": 372018032, "router_z_loss_mlp": 0.09375, "routerloss_mlp": 0.0, "step": 4487, "time_per_iteration": 3.206104040145874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048614, "balance_loss_mlp": 1.03938758, "diversity_loss_mlp": 0.0, "epoch": 0.8634090034628703, "flos": 767913509376.0, "grad_norm": 0.07054996301110567, "language_loss": 0.78534716, "learning_rate": 4.813447472684246e-05, "loss": 0.79583335, "num_input_tokens_seen": 372092176, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 4488, "time_per_iteration": 2.933553695678711 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049286, "balance_loss_mlp": 1.03989816, "diversity_loss_mlp": 0.0, "epoch": 0.8636013851481339, "flos": 520591504896.0, "grad_norm": 0.07600335888626973, "language_loss": 0.83061361, "learning_rate": 4.800119097704214e-05, "loss": 0.84110641, "num_input_tokens_seen": 372166880, "router_z_loss_mlp": 0.09387207, "routerloss_mlp": 0.0, "step": 4489, "time_per_iteration": 2.7383370399475098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046793, "balance_loss_mlp": 1.03779912, "diversity_loss_mlp": 0.0, "epoch": 0.8637937668333975, "flos": 632144342016.0, "grad_norm": 0.08034973175032056, "language_loss": 0.80326092, "learning_rate": 4.7868082708742324e-05, "loss": 0.81372881, "num_input_tokens_seen": 372234608, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4490, "time_per_iteration": 2.734177827835083 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044623, "balance_loss_mlp": 1.03565812, "diversity_loss_mlp": 0.0, "epoch": 0.8639861485186611, "flos": 856094676480.0, "grad_norm": 0.057692915875148014, "language_loss": 0.76451778, "learning_rate": 4.773514997362e-05, "loss": 0.77496397, "num_input_tokens_seen": 372314704, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4491, "time_per_iteration": 3.0788826942443848 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049145, "balance_loss_mlp": 1.04005527, "diversity_loss_mlp": 0.0, "epoch": 0.8641785302039245, "flos": 481261118976.0, "grad_norm": 0.07466724897853576, "language_loss": 0.77982771, "learning_rate": 4.7602392823284605e-05, "loss": 0.79031909, "num_input_tokens_seen": 372374848, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4492, "time_per_iteration": 2.530029058456421 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048683, "balance_loss_mlp": 1.039379, "diversity_loss_mlp": 0.0, "epoch": 0.8643709118891881, "flos": 504637558272.0, "grad_norm": 0.07260420646457022, "language_loss": 0.80692542, "learning_rate": 4.746981130927675e-05, "loss": 0.81741226, "num_input_tokens_seen": 372442432, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 4493, "time_per_iteration": 2.577784538269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00778204, "balance_loss_mlp": 1.31030798, "diversity_loss_mlp": 0.22490472, "epoch": 0.8645632935744517, "flos": 552368719872.0, "grad_norm": 0.03497904945521898, "language_loss": 0.82458371, "learning_rate": 4.733740548306908e-05, "loss": 0.83236575, "num_input_tokens_seen": 372520048, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01059737, "step": 4494, "time_per_iteration": 2.807935953140259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047253, "balance_loss_mlp": 1.03800845, "diversity_loss_mlp": 0.0, "epoch": 0.8647556752597153, "flos": 524737751040.0, "grad_norm": 0.07146424710596733, "language_loss": 0.84123516, "learning_rate": 4.7205175396066336e-05, "loss": 0.8517077, "num_input_tokens_seen": 372587968, "router_z_loss_mlp": 0.09246826, "routerloss_mlp": 0.0, "step": 4495, "time_per_iteration": 2.5809860229492188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043935, "balance_loss_mlp": 1.03464222, "diversity_loss_mlp": 0.0, "epoch": 0.8649480569449788, "flos": 787768851456.0, "grad_norm": 0.07059483757370776, "language_loss": 0.81995988, "learning_rate": 4.707312109960471e-05, "loss": 0.83039922, "num_input_tokens_seen": 372672544, "router_z_loss_mlp": 0.09295654, "routerloss_mlp": 0.0, "step": 4496, "time_per_iteration": 3.083287477493286 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104705, "balance_loss_mlp": 1.03781724, "diversity_loss_mlp": 0.0, "epoch": 0.8651404386302424, "flos": 763863810048.0, "grad_norm": 0.06772870422342313, "language_loss": 0.76696306, "learning_rate": 4.694124264495225e-05, "loss": 0.77743357, "num_input_tokens_seen": 372751296, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4497, "time_per_iteration": 3.074000835418701 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045348, "balance_loss_mlp": 1.03595984, "diversity_loss_mlp": 0.0, "epoch": 0.865332820315506, "flos": 539893651968.0, "grad_norm": 0.07122639959522058, "language_loss": 0.82500464, "learning_rate": 4.680954008330851e-05, "loss": 0.83545816, "num_input_tokens_seen": 372825264, "router_z_loss_mlp": 0.09381104, "routerloss_mlp": 0.0, "step": 4498, "time_per_iteration": 2.7418711185455322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01011015, "balance_loss_mlp": 1.00648534, "diversity_loss_mlp": 0.0, "epoch": 0.8655252020007695, "flos": 1476632830464.0, "grad_norm": 0.011864937591166903, "language_loss": 0.79174447, "learning_rate": 4.667801346580519e-05, "loss": 0.80185461, "num_input_tokens_seen": 373052000, "router_z_loss_mlp": 0.04541016, "routerloss_mlp": 0.0, "step": 4499, "time_per_iteration": 4.7632763385772705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044614, "balance_loss_mlp": 1.03568506, "diversity_loss_mlp": 0.0, "epoch": 0.8657175836860331, "flos": 517369586688.0, "grad_norm": 0.060500475018932964, "language_loss": 0.82638729, "learning_rate": 4.6546662843505396e-05, "loss": 0.83683342, "num_input_tokens_seen": 373124128, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4500, "time_per_iteration": 2.673696756362915 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043007, "balance_loss_mlp": 1.0338217, "diversity_loss_mlp": 0.0, "epoch": 0.8659099653712966, "flos": 590523333120.0, "grad_norm": 0.07115245817272867, "language_loss": 0.80032218, "learning_rate": 4.641548826740394e-05, "loss": 0.81075215, "num_input_tokens_seen": 373195472, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 4501, "time_per_iteration": 2.7261881828308105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104351, "balance_loss_mlp": 1.03443861, "diversity_loss_mlp": 0.0, "epoch": 0.8661023470565602, "flos": 590449181184.0, "grad_norm": 0.05583001645863395, "language_loss": 0.88010484, "learning_rate": 4.628448978842731e-05, "loss": 0.89054, "num_input_tokens_seen": 373273504, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4502, "time_per_iteration": 2.8443400859832764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043003, "balance_loss_mlp": 1.03399086, "diversity_loss_mlp": 0.0, "epoch": 0.8662947287418238, "flos": 567670726656.0, "grad_norm": 0.06991854339818697, "language_loss": 0.79483074, "learning_rate": 4.61536674574336e-05, "loss": 0.80526078, "num_input_tokens_seen": 373346032, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4503, "time_per_iteration": 2.7233920097351074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045559, "balance_loss_mlp": 1.0366962, "diversity_loss_mlp": 0.0, "epoch": 0.8664871104270874, "flos": 515929139712.0, "grad_norm": 0.06089898281543335, "language_loss": 0.82218802, "learning_rate": 4.6023021325212636e-05, "loss": 0.83264363, "num_input_tokens_seen": 373419968, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4504, "time_per_iteration": 2.7425873279571533 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050274, "balance_loss_mlp": 1.04102361, "diversity_loss_mlp": 0.0, "epoch": 0.866679492112351, "flos": 557263452672.0, "grad_norm": 0.06301593457003249, "language_loss": 0.78539002, "learning_rate": 4.589255144248561e-05, "loss": 0.79589272, "num_input_tokens_seen": 373502448, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 4505, "time_per_iteration": 2.779776096343994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044731, "balance_loss_mlp": 1.03568339, "diversity_loss_mlp": 0.0, "epoch": 0.8668718737976144, "flos": 722448004608.0, "grad_norm": 0.08053258741139525, "language_loss": 0.81561208, "learning_rate": 4.57622578599054e-05, "loss": 0.82605934, "num_input_tokens_seen": 373581184, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4506, "time_per_iteration": 2.9221668243408203 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104516, "balance_loss_mlp": 1.03598642, "diversity_loss_mlp": 0.0, "epoch": 0.867064255482878, "flos": 600705580032.0, "grad_norm": 0.0716656508067539, "language_loss": 0.84894359, "learning_rate": 4.5632140628056705e-05, "loss": 0.85939521, "num_input_tokens_seen": 373652272, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4507, "time_per_iteration": 2.72947359085083 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045976, "balance_loss_mlp": 1.03671956, "diversity_loss_mlp": 0.0, "epoch": 0.8672566371681416, "flos": 803527879680.0, "grad_norm": 0.06708434542706315, "language_loss": 0.76185739, "learning_rate": 4.550219979745529e-05, "loss": 0.77231717, "num_input_tokens_seen": 373734896, "router_z_loss_mlp": 0.09265137, "routerloss_mlp": 0.0, "step": 4508, "time_per_iteration": 3.0237209796905518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044629, "balance_loss_mlp": 1.03565264, "diversity_loss_mlp": 0.0, "epoch": 0.8674490188534052, "flos": 627368177664.0, "grad_norm": 0.06518598780385719, "language_loss": 0.83932543, "learning_rate": 4.5372435418548905e-05, "loss": 0.84977174, "num_input_tokens_seen": 373806960, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4509, "time_per_iteration": 2.755521059036255 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047969, "balance_loss_mlp": 1.03887904, "diversity_loss_mlp": 0.0, "epoch": 0.8676414005386687, "flos": 727831692288.0, "grad_norm": 0.0684158926680597, "language_loss": 0.86113983, "learning_rate": 4.524284754171615e-05, "loss": 0.87161952, "num_input_tokens_seen": 373888352, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4510, "time_per_iteration": 3.0163121223449707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046767, "balance_loss_mlp": 1.03768277, "diversity_loss_mlp": 0.0, "epoch": 0.8678337822239323, "flos": 539972573184.0, "grad_norm": 0.06806250868382878, "language_loss": 0.80556583, "learning_rate": 4.5113436217267765e-05, "loss": 0.81603348, "num_input_tokens_seen": 373962112, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 4511, "time_per_iteration": 2.7898309230804443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045577, "balance_loss_mlp": 1.03637373, "diversity_loss_mlp": 0.0, "epoch": 0.8680261639091958, "flos": 507521023488.0, "grad_norm": 0.09053329692660277, "language_loss": 0.79419863, "learning_rate": 4.4984201495445744e-05, "loss": 0.80465442, "num_input_tokens_seen": 374028256, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 4512, "time_per_iteration": 2.579146385192871 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104619, "balance_loss_mlp": 1.03741062, "diversity_loss_mlp": 0.0, "epoch": 0.8682185455944594, "flos": 487126794240.0, "grad_norm": 0.06296584652642616, "language_loss": 0.80771571, "learning_rate": 4.4855143426423275e-05, "loss": 0.81817764, "num_input_tokens_seen": 374100080, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 4513, "time_per_iteration": 2.6543962955474854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045095, "balance_loss_mlp": 1.03607059, "diversity_loss_mlp": 0.0, "epoch": 0.868410927279723, "flos": 603690361344.0, "grad_norm": 0.07075999679510799, "language_loss": 0.81035638, "learning_rate": 4.472626206030528e-05, "loss": 0.82080734, "num_input_tokens_seen": 374174368, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 4514, "time_per_iteration": 2.7115249633789062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104638, "balance_loss_mlp": 1.03727281, "diversity_loss_mlp": 0.0, "epoch": 0.8686033089649865, "flos": 1118985186816.0, "grad_norm": 0.08852072985797838, "language_loss": 0.84644556, "learning_rate": 4.4597557447127846e-05, "loss": 0.85690933, "num_input_tokens_seen": 374257328, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4515, "time_per_iteration": 3.3953351974487305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048525, "balance_loss_mlp": 1.03951859, "diversity_loss_mlp": 0.0, "epoch": 0.8687956906502501, "flos": 568019091456.0, "grad_norm": 0.09550241245969901, "language_loss": 0.83630067, "learning_rate": 4.446902963685862e-05, "loss": 0.8467859, "num_input_tokens_seen": 374327936, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4516, "time_per_iteration": 2.7019460201263428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046576, "balance_loss_mlp": 1.03759933, "diversity_loss_mlp": 0.0, "epoch": 0.8689880723355137, "flos": 544338703872.0, "grad_norm": 0.061078878472804264, "language_loss": 0.84983051, "learning_rate": 4.4340678679396454e-05, "loss": 0.86029625, "num_input_tokens_seen": 374400496, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4517, "time_per_iteration": 2.6748125553131104 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050094, "balance_loss_mlp": 1.04121304, "diversity_loss_mlp": 0.0, "epoch": 0.8691804540207773, "flos": 457425086976.0, "grad_norm": 0.06941157706477712, "language_loss": 0.86215872, "learning_rate": 4.4212504624571495e-05, "loss": 0.87265968, "num_input_tokens_seen": 374470528, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4518, "time_per_iteration": 2.580519914627075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049225, "balance_loss_mlp": 1.0403676, "diversity_loss_mlp": 0.0, "epoch": 0.8693728357060407, "flos": 591872375808.0, "grad_norm": 0.060481411793616664, "language_loss": 0.79905188, "learning_rate": 4.40845075221456e-05, "loss": 0.80954409, "num_input_tokens_seen": 374542656, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4519, "time_per_iteration": 2.739733934402466 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049186, "balance_loss_mlp": 1.04021573, "diversity_loss_mlp": 0.0, "epoch": 0.8695652173913043, "flos": 680263515648.0, "grad_norm": 0.08287606201497805, "language_loss": 0.79479718, "learning_rate": 4.395668742181164e-05, "loss": 0.80528903, "num_input_tokens_seen": 374617232, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4520, "time_per_iteration": 2.8706867694854736 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050204, "balance_loss_mlp": 1.04147816, "diversity_loss_mlp": 0.0, "epoch": 0.8697575990765679, "flos": 492362551296.0, "grad_norm": 0.06861911538387308, "language_loss": 0.7854861, "learning_rate": 4.38290443731934e-05, "loss": 0.7959882, "num_input_tokens_seen": 374681888, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4521, "time_per_iteration": 2.5845677852630615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051004, "balance_loss_mlp": 1.0421524, "diversity_loss_mlp": 0.0, "epoch": 0.8699499807618315, "flos": 526949079552.0, "grad_norm": 0.0587255823279189, "language_loss": 0.82027864, "learning_rate": 4.370157842584671e-05, "loss": 0.83078861, "num_input_tokens_seen": 374750464, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4522, "time_per_iteration": 2.7062559127807617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047666, "balance_loss_mlp": 1.03883255, "diversity_loss_mlp": 0.0, "epoch": 0.8701423624470951, "flos": 814342616064.0, "grad_norm": 0.07380194299564537, "language_loss": 0.80566227, "learning_rate": 4.357428962925808e-05, "loss": 0.81613898, "num_input_tokens_seen": 374836064, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4523, "time_per_iteration": 3.1326324939727783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050998, "balance_loss_mlp": 1.04187274, "diversity_loss_mlp": 0.0, "epoch": 0.8703347441323586, "flos": 556789178880.0, "grad_norm": 0.06623832108710956, "language_loss": 0.88391662, "learning_rate": 4.344717803284542e-05, "loss": 0.89442658, "num_input_tokens_seen": 374903392, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4524, "time_per_iteration": 2.684760808944702 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048562, "balance_loss_mlp": 1.03950179, "diversity_loss_mlp": 0.0, "epoch": 0.8705271258176221, "flos": 585443220480.0, "grad_norm": 0.06258298642895538, "language_loss": 0.84498411, "learning_rate": 4.3320243685957825e-05, "loss": 0.8554697, "num_input_tokens_seen": 374985904, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4525, "time_per_iteration": 2.8076937198638916 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050394, "balance_loss_mlp": 1.04153669, "diversity_loss_mlp": 0.0, "epoch": 0.8707195075028857, "flos": 669216411648.0, "grad_norm": 0.058503085061922935, "language_loss": 0.85245442, "learning_rate": 4.3193486637875536e-05, "loss": 0.86295837, "num_input_tokens_seen": 375062992, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4526, "time_per_iteration": 2.938445806503296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045477, "balance_loss_mlp": 1.03660226, "diversity_loss_mlp": 0.0, "epoch": 0.8709118891881493, "flos": 520391443968.0, "grad_norm": 0.06425490678836035, "language_loss": 0.83926785, "learning_rate": 4.306690693781007e-05, "loss": 0.84972262, "num_input_tokens_seen": 375139296, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4527, "time_per_iteration": 2.759881019592285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104833, "balance_loss_mlp": 1.03936505, "diversity_loss_mlp": 0.0, "epoch": 0.8711042708734128, "flos": 553208984064.0, "grad_norm": 0.07304239619490156, "language_loss": 0.81745154, "learning_rate": 4.294050463490401e-05, "loss": 0.8279348, "num_input_tokens_seen": 375206576, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4528, "time_per_iteration": 2.6849725246429443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048731, "balance_loss_mlp": 1.04004014, "diversity_loss_mlp": 0.0, "epoch": 0.8712966525586764, "flos": 502193862144.0, "grad_norm": 0.08116186300687973, "language_loss": 0.82389712, "learning_rate": 4.281427977823094e-05, "loss": 0.83438438, "num_input_tokens_seen": 375279008, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4529, "time_per_iteration": 2.721444606781006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047464, "balance_loss_mlp": 1.03866649, "diversity_loss_mlp": 0.0, "epoch": 0.87148903424394, "flos": 804096129024.0, "grad_norm": 0.0788947608454547, "language_loss": 0.73803437, "learning_rate": 4.268823241679593e-05, "loss": 0.74850899, "num_input_tokens_seen": 375368512, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4530, "time_per_iteration": 3.0360207557678223 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047532, "balance_loss_mlp": 1.03866839, "diversity_loss_mlp": 0.0, "epoch": 0.8716814159292036, "flos": 773438160384.0, "grad_norm": 0.061803367683131466, "language_loss": 0.86130869, "learning_rate": 4.256236259953489e-05, "loss": 0.87178397, "num_input_tokens_seen": 375450528, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4531, "time_per_iteration": 3.0060312747955322 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051694, "balance_loss_mlp": 1.04256225, "diversity_loss_mlp": 0.0, "epoch": 0.8718737976144671, "flos": 486835329024.0, "grad_norm": 0.08097144635360554, "language_loss": 0.85292768, "learning_rate": 4.243667037531468e-05, "loss": 0.86344463, "num_input_tokens_seen": 375518256, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 4532, "time_per_iteration": 2.5708203315734863 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042692, "balance_loss_mlp": 1.03403783, "diversity_loss_mlp": 0.0, "epoch": 0.8720661792997306, "flos": 584123913216.0, "grad_norm": 0.07173781512264084, "language_loss": 0.7855528, "learning_rate": 4.2311155792933264e-05, "loss": 0.79597974, "num_input_tokens_seen": 375588112, "router_z_loss_mlp": 0.08660889, "routerloss_mlp": 0.0, "step": 4533, "time_per_iteration": 2.714898109436035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100683, "balance_loss_mlp": 1.00234771, "diversity_loss_mlp": 0.0, "epoch": 0.8722585609849942, "flos": 1495942318080.0, "grad_norm": 0.011018751042369157, "language_loss": 0.80966806, "learning_rate": 4.2185818901119946e-05, "loss": 0.81973636, "num_input_tokens_seen": 375814496, "router_z_loss_mlp": 0.04492188, "routerloss_mlp": 0.0, "step": 4534, "time_per_iteration": 4.830231189727783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046788, "balance_loss_mlp": 1.03760934, "diversity_loss_mlp": 0.0, "epoch": 0.8724509426702578, "flos": 596169123840.0, "grad_norm": 0.0639859938433398, "language_loss": 0.87151349, "learning_rate": 4.206065974853479e-05, "loss": 0.88198137, "num_input_tokens_seen": 375885440, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4535, "time_per_iteration": 2.7394185066223145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044463, "balance_loss_mlp": 1.03511095, "diversity_loss_mlp": 0.0, "epoch": 0.8726433243555214, "flos": 443635481088.0, "grad_norm": 0.07410951797613952, "language_loss": 0.80976605, "learning_rate": 4.193567838376888e-05, "loss": 0.8202107, "num_input_tokens_seen": 375952640, "router_z_loss_mlp": 0.09344482, "routerloss_mlp": 0.0, "step": 4536, "time_per_iteration": 2.5781943798065186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048364, "balance_loss_mlp": 1.03959656, "diversity_loss_mlp": 0.0, "epoch": 0.8728357060407849, "flos": 553181819904.0, "grad_norm": 0.07408162868136768, "language_loss": 0.82072723, "learning_rate": 4.181087485534402e-05, "loss": 0.83121085, "num_input_tokens_seen": 376021648, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4537, "time_per_iteration": 2.6797525882720947 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046083, "balance_loss_mlp": 1.03713083, "diversity_loss_mlp": 0.0, "epoch": 0.8730280877260485, "flos": 627807946752.0, "grad_norm": 0.07156355175880628, "language_loss": 0.78797638, "learning_rate": 4.16862492117136e-05, "loss": 0.79843724, "num_input_tokens_seen": 376102304, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4538, "time_per_iteration": 2.8440496921539307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047687, "balance_loss_mlp": 1.03858507, "diversity_loss_mlp": 0.0, "epoch": 0.873220469411312, "flos": 535384359936.0, "grad_norm": 0.0722387407949978, "language_loss": 0.79965913, "learning_rate": 4.156180150126143e-05, "loss": 0.81013602, "num_input_tokens_seen": 376177072, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4539, "time_per_iteration": 2.721238136291504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050694, "balance_loss_mlp": 1.04186094, "diversity_loss_mlp": 0.0, "epoch": 0.8734128510965756, "flos": 561883972608.0, "grad_norm": 0.12124336335781533, "language_loss": 0.84041327, "learning_rate": 4.143753177230242e-05, "loss": 0.8509202, "num_input_tokens_seen": 376251376, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4540, "time_per_iteration": 2.6914098262786865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045135, "balance_loss_mlp": 1.03622985, "diversity_loss_mlp": 0.0, "epoch": 0.8736052327818392, "flos": 686467643904.0, "grad_norm": 0.07799885017860995, "language_loss": 0.79752243, "learning_rate": 4.131344007308224e-05, "loss": 0.80797374, "num_input_tokens_seen": 376337104, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4541, "time_per_iteration": 2.93182110786438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048792, "balance_loss_mlp": 1.03960705, "diversity_loss_mlp": 0.0, "epoch": 0.8737976144671027, "flos": 531673113600.0, "grad_norm": 0.06451256022818536, "language_loss": 0.81514108, "learning_rate": 4.1189526451777816e-05, "loss": 0.82562894, "num_input_tokens_seen": 376415456, "router_z_loss_mlp": 0.09185791, "routerloss_mlp": 0.0, "step": 4542, "time_per_iteration": 2.8326876163482666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00778397, "balance_loss_mlp": 1.31047845, "diversity_loss_mlp": 0.22450379, "epoch": 0.8739899961523663, "flos": 575592086016.0, "grad_norm": 0.03126791623306444, "language_loss": 0.81873107, "learning_rate": 4.106579095649649e-05, "loss": 0.82651508, "num_input_tokens_seen": 376494880, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01090602, "step": 4543, "time_per_iteration": 2.9323105812072754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048028, "balance_loss_mlp": 1.03904009, "diversity_loss_mlp": 0.0, "epoch": 0.8741823778376299, "flos": 731332965888.0, "grad_norm": 0.09261999312040192, "language_loss": 0.76578218, "learning_rate": 4.094223363527666e-05, "loss": 0.77626246, "num_input_tokens_seen": 376571760, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4544, "time_per_iteration": 2.8980069160461426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104863, "balance_loss_mlp": 1.03955245, "diversity_loss_mlp": 0.0, "epoch": 0.8743747595228935, "flos": 567080082432.0, "grad_norm": 0.06860768160110936, "language_loss": 0.83654785, "learning_rate": 4.081885453608747e-05, "loss": 0.84703422, "num_input_tokens_seen": 376644464, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4545, "time_per_iteration": 2.7457897663116455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049582, "balance_loss_mlp": 1.04058218, "diversity_loss_mlp": 0.0, "epoch": 0.8745671412081569, "flos": 493370569728.0, "grad_norm": 0.06696244649326027, "language_loss": 0.82145166, "learning_rate": 4.0695653706829095e-05, "loss": 0.83194745, "num_input_tokens_seen": 376709584, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4546, "time_per_iteration": 2.5956528186798096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050136, "balance_loss_mlp": 1.04104638, "diversity_loss_mlp": 0.0, "epoch": 0.8747595228934205, "flos": 524139766272.0, "grad_norm": 0.06814063729509118, "language_loss": 0.83736241, "learning_rate": 4.057263119533233e-05, "loss": 0.84786379, "num_input_tokens_seen": 376779472, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4547, "time_per_iteration": 2.6598734855651855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104913, "balance_loss_mlp": 1.04005837, "diversity_loss_mlp": 0.0, "epoch": 0.8749519045786841, "flos": 744349118976.0, "grad_norm": 0.07262523755606552, "language_loss": 0.80276871, "learning_rate": 4.044978704935853e-05, "loss": 0.81325996, "num_input_tokens_seen": 376863408, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4548, "time_per_iteration": 3.042619466781616 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054339, "balance_loss_mlp": 1.04545808, "diversity_loss_mlp": 0.0, "epoch": 0.8751442862639477, "flos": 594278995968.0, "grad_norm": 0.0643557055974673, "language_loss": 0.79893917, "learning_rate": 4.032712131660027e-05, "loss": 0.80948257, "num_input_tokens_seen": 376942080, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4549, "time_per_iteration": 2.8232662677764893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045807, "balance_loss_mlp": 1.03677678, "diversity_loss_mlp": 0.0, "epoch": 0.8753366679492113, "flos": 496530819072.0, "grad_norm": 0.06974853076229501, "language_loss": 0.78530467, "learning_rate": 4.020463404468055e-05, "loss": 0.79576278, "num_input_tokens_seen": 377015696, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4550, "time_per_iteration": 2.7248096466064453 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046831, "balance_loss_mlp": 1.03792024, "diversity_loss_mlp": 0.0, "epoch": 0.8755290496344748, "flos": 489864526848.0, "grad_norm": 0.08026438876668639, "language_loss": 0.81858146, "learning_rate": 4.0082325281153074e-05, "loss": 0.82904983, "num_input_tokens_seen": 377081424, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4551, "time_per_iteration": 2.563875436782837 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046474, "balance_loss_mlp": 1.03774762, "diversity_loss_mlp": 0.0, "epoch": 0.8757214313197383, "flos": 591859892736.0, "grad_norm": 0.27955745224323525, "language_loss": 0.81637728, "learning_rate": 3.9960195073502345e-05, "loss": 0.82684195, "num_input_tokens_seen": 377159360, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 4552, "time_per_iteration": 2.810784339904785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048687, "balance_loss_mlp": 1.03973484, "diversity_loss_mlp": 0.0, "epoch": 0.8759138130050019, "flos": 976843763712.0, "grad_norm": 0.0711083365968444, "language_loss": 0.78033483, "learning_rate": 3.9838243469143555e-05, "loss": 0.79082167, "num_input_tokens_seen": 377240704, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4553, "time_per_iteration": 3.2460765838623047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048489, "balance_loss_mlp": 1.03957188, "diversity_loss_mlp": 0.0, "epoch": 0.8761061946902655, "flos": 802764338688.0, "grad_norm": 0.05712124953956382, "language_loss": 0.77816379, "learning_rate": 3.971647051542243e-05, "loss": 0.78864872, "num_input_tokens_seen": 377324176, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4554, "time_per_iteration": 3.0767805576324463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046716, "balance_loss_mlp": 1.03772795, "diversity_loss_mlp": 0.0, "epoch": 0.8762985763755291, "flos": 698495602176.0, "grad_norm": 0.0721600968568646, "language_loss": 0.74639142, "learning_rate": 3.95948762596155e-05, "loss": 0.75685859, "num_input_tokens_seen": 377403440, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4555, "time_per_iteration": 2.9832050800323486 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052245, "balance_loss_mlp": 1.04343569, "diversity_loss_mlp": 0.0, "epoch": 0.8764909580607926, "flos": 629717898240.0, "grad_norm": 0.06902673277726463, "language_loss": 0.80373311, "learning_rate": 3.9473460748929765e-05, "loss": 0.8142556, "num_input_tokens_seen": 377483440, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4556, "time_per_iteration": 2.8642075061798096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047751, "balance_loss_mlp": 1.03882241, "diversity_loss_mlp": 0.0, "epoch": 0.8766833397460562, "flos": 481545243648.0, "grad_norm": 0.06429651244751071, "language_loss": 0.80069965, "learning_rate": 3.935222403050304e-05, "loss": 0.81117713, "num_input_tokens_seen": 377554688, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4557, "time_per_iteration": 2.6734185218811035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048146, "balance_loss_mlp": 1.03912759, "diversity_loss_mlp": 0.0, "epoch": 0.8768757214313198, "flos": 407734414848.0, "grad_norm": 0.06573901979402896, "language_loss": 0.78168076, "learning_rate": 3.923116615140354e-05, "loss": 0.79216218, "num_input_tokens_seen": 377617616, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4558, "time_per_iteration": 2.5166428089141846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049801, "balance_loss_mlp": 1.04095614, "diversity_loss_mlp": 0.0, "epoch": 0.8770681031165833, "flos": 582582150144.0, "grad_norm": 0.0842466180792191, "language_loss": 0.8216058, "learning_rate": 3.9110287158630076e-05, "loss": 0.83210379, "num_input_tokens_seen": 377685888, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4559, "time_per_iteration": 2.666722536087036 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050514, "balance_loss_mlp": 1.04134107, "diversity_loss_mlp": 0.0, "epoch": 0.8772604848018468, "flos": 508687257600.0, "grad_norm": 0.07334962326293068, "language_loss": 0.80860007, "learning_rate": 3.8989587099111875e-05, "loss": 0.81910527, "num_input_tokens_seen": 377755744, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4560, "time_per_iteration": 2.627713441848755 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050224, "balance_loss_mlp": 1.04125929, "diversity_loss_mlp": 0.0, "epoch": 0.8774528664871104, "flos": 408836408832.0, "grad_norm": 0.07694067808462435, "language_loss": 0.8510192, "learning_rate": 3.886906601970913e-05, "loss": 0.86152148, "num_input_tokens_seen": 377818880, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4561, "time_per_iteration": 2.5129141807556152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049748, "balance_loss_mlp": 1.04076576, "diversity_loss_mlp": 0.0, "epoch": 0.877645248172374, "flos": 500844819456.0, "grad_norm": 0.05712308761867227, "language_loss": 0.83274788, "learning_rate": 3.8748723967212184e-05, "loss": 0.84324539, "num_input_tokens_seen": 377893280, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4562, "time_per_iteration": 2.6301164627075195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00775546, "balance_loss_mlp": 1.3038888, "diversity_loss_mlp": 0.22576925, "epoch": 0.8778376298576376, "flos": 633145019904.0, "grad_norm": 0.034853936620068894, "language_loss": 0.7813766, "learning_rate": 3.862856098834189e-05, "loss": 0.78913212, "num_input_tokens_seen": 377972912, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01071687, "step": 4563, "time_per_iteration": 2.876042604446411 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01055367, "balance_loss_mlp": 1.04642081, "diversity_loss_mlp": 0.0, "epoch": 0.8780300115429012, "flos": 533988329472.0, "grad_norm": 0.06747212929306415, "language_loss": 0.80067873, "learning_rate": 3.850857712974976e-05, "loss": 0.81123239, "num_input_tokens_seen": 378054000, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4564, "time_per_iteration": 2.8073532581329346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052003, "balance_loss_mlp": 1.04328895, "diversity_loss_mlp": 0.0, "epoch": 0.8782223932281646, "flos": 511662127104.0, "grad_norm": 0.06003904599639906, "language_loss": 0.77326131, "learning_rate": 3.838877243801758e-05, "loss": 0.78378129, "num_input_tokens_seen": 378120336, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4565, "time_per_iteration": 2.6049962043762207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050973, "balance_loss_mlp": 1.04202604, "diversity_loss_mlp": 0.0, "epoch": 0.8784147749134282, "flos": 780714547200.0, "grad_norm": 0.064833498730125, "language_loss": 0.70079195, "learning_rate": 3.826914695965766e-05, "loss": 0.71130168, "num_input_tokens_seen": 378216672, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4566, "time_per_iteration": 3.1731789112091064 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00786853, "balance_loss_mlp": 1.32932496, "diversity_loss_mlp": 0.22292963, "epoch": 0.8786071565986918, "flos": 561004434432.0, "grad_norm": 0.0397840730750478, "language_loss": 0.76011282, "learning_rate": 3.814970074111279e-05, "loss": 0.76798129, "num_input_tokens_seen": 378287536, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01072608, "step": 4567, "time_per_iteration": 2.697258472442627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050311, "balance_loss_mlp": 1.04135227, "diversity_loss_mlp": 0.0, "epoch": 0.8787995382839554, "flos": 603448081920.0, "grad_norm": 0.06722529563230402, "language_loss": 0.77491319, "learning_rate": 3.8030433828755926e-05, "loss": 0.78541636, "num_input_tokens_seen": 378362128, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4568, "time_per_iteration": 2.840650796890259 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050171, "balance_loss_mlp": 1.04145098, "diversity_loss_mlp": 0.0, "epoch": 0.8789919199692189, "flos": 560233552896.0, "grad_norm": 0.05883368445240149, "language_loss": 0.8492918, "learning_rate": 3.7911346268890924e-05, "loss": 0.85979354, "num_input_tokens_seen": 378435696, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4569, "time_per_iteration": 2.6557326316833496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051532, "balance_loss_mlp": 1.04278803, "diversity_loss_mlp": 0.0, "epoch": 0.8791843016544825, "flos": 539115429888.0, "grad_norm": 0.07943052402500107, "language_loss": 0.8255586, "learning_rate": 3.7792438107751405e-05, "loss": 0.83607388, "num_input_tokens_seen": 378505664, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 4570, "time_per_iteration": 2.627609968185425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053016, "balance_loss_mlp": 1.04396188, "diversity_loss_mlp": 0.0, "epoch": 0.8793766833397461, "flos": 1008699899904.0, "grad_norm": 0.06059091910308417, "language_loss": 0.79351205, "learning_rate": 3.767370939150167e-05, "loss": 0.80404216, "num_input_tokens_seen": 378598016, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4571, "time_per_iteration": 3.35367751121521 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052417, "balance_loss_mlp": 1.0433991, "diversity_loss_mlp": 0.0, "epoch": 0.8795690650250096, "flos": 678637688832.0, "grad_norm": 0.06539899330048332, "language_loss": 0.80981296, "learning_rate": 3.755516016623628e-05, "loss": 0.82033718, "num_input_tokens_seen": 378676176, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4572, "time_per_iteration": 2.880627155303955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104993, "balance_loss_mlp": 1.0410192, "diversity_loss_mlp": 0.0, "epoch": 0.8797614467102732, "flos": 453432287232.0, "grad_norm": 0.07570874184627417, "language_loss": 0.88668913, "learning_rate": 3.7436790477980157e-05, "loss": 0.89718843, "num_input_tokens_seen": 378737952, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4573, "time_per_iteration": 2.563573122024536 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051581, "balance_loss_mlp": 1.04257524, "diversity_loss_mlp": 0.0, "epoch": 0.8799538283955367, "flos": 550913591808.0, "grad_norm": 0.06673280620392491, "language_loss": 0.84119153, "learning_rate": 3.7318600372688526e-05, "loss": 0.8517074, "num_input_tokens_seen": 378806704, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4574, "time_per_iteration": 2.6805808544158936 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052755, "balance_loss_mlp": 1.04388571, "diversity_loss_mlp": 0.0, "epoch": 0.8801462100808003, "flos": 807429275136.0, "grad_norm": 0.07043061387858378, "language_loss": 0.84413314, "learning_rate": 3.720058989624681e-05, "loss": 0.85466063, "num_input_tokens_seen": 378887616, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4575, "time_per_iteration": 3.049510955810547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051216, "balance_loss_mlp": 1.04210222, "diversity_loss_mlp": 0.0, "epoch": 0.8803385917660639, "flos": 768694302720.0, "grad_norm": 0.06156041987406192, "language_loss": 0.84676832, "learning_rate": 3.708275909447079e-05, "loss": 0.85728043, "num_input_tokens_seen": 378964656, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4576, "time_per_iteration": 2.931907892227173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050983, "balance_loss_mlp": 1.04205978, "diversity_loss_mlp": 0.0, "epoch": 0.8805309734513275, "flos": 567339614208.0, "grad_norm": 0.05826624297126263, "language_loss": 0.81173784, "learning_rate": 3.696510801310632e-05, "loss": 0.82224762, "num_input_tokens_seen": 379036752, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4577, "time_per_iteration": 2.7370834350585938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051952, "balance_loss_mlp": 1.04316616, "diversity_loss_mlp": 0.0, "epoch": 0.880723355136591, "flos": 679779330048.0, "grad_norm": 0.06645498049207266, "language_loss": 0.81695998, "learning_rate": 3.6847636697829755e-05, "loss": 0.82747948, "num_input_tokens_seen": 379106480, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4578, "time_per_iteration": 2.7928130626678467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105216, "balance_loss_mlp": 1.04327333, "diversity_loss_mlp": 0.0, "epoch": 0.8809157368218545, "flos": 565629723648.0, "grad_norm": 0.06357300740797822, "language_loss": 0.79227793, "learning_rate": 3.673034519424734e-05, "loss": 0.80279958, "num_input_tokens_seen": 379182544, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4579, "time_per_iteration": 2.7231593132019043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050745, "balance_loss_mlp": 1.04194164, "diversity_loss_mlp": 0.0, "epoch": 0.8811081185071181, "flos": 515407878144.0, "grad_norm": 0.059350650415536, "language_loss": 0.76098466, "learning_rate": 3.661323354789586e-05, "loss": 0.77149218, "num_input_tokens_seen": 379255856, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4580, "time_per_iteration": 2.683220624923706 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048772, "balance_loss_mlp": 1.03990269, "diversity_loss_mlp": 0.0, "epoch": 0.8813005001923817, "flos": 594343236096.0, "grad_norm": 0.06771926957891432, "language_loss": 0.81324798, "learning_rate": 3.649630180424191e-05, "loss": 0.82373571, "num_input_tokens_seen": 379322704, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4581, "time_per_iteration": 2.6779592037200928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050029, "balance_loss_mlp": 1.04133832, "diversity_loss_mlp": 0.0, "epoch": 0.8814928818776453, "flos": 666940843008.0, "grad_norm": 0.07585053291634766, "language_loss": 0.79299724, "learning_rate": 3.637955000868254e-05, "loss": 0.80349755, "num_input_tokens_seen": 379395008, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4582, "time_per_iteration": 2.831101894378662 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052674, "balance_loss_mlp": 1.04368544, "diversity_loss_mlp": 0.0, "epoch": 0.8816852635629088, "flos": 609153343488.0, "grad_norm": 0.06530916783888785, "language_loss": 0.85757875, "learning_rate": 3.626297820654467e-05, "loss": 0.86810547, "num_input_tokens_seen": 379465824, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4583, "time_per_iteration": 2.7231874465942383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050068, "balance_loss_mlp": 1.04128242, "diversity_loss_mlp": 0.0, "epoch": 0.8818776452481724, "flos": 480379009536.0, "grad_norm": 0.07680446741638405, "language_loss": 0.82252479, "learning_rate": 3.614658644308572e-05, "loss": 0.83302546, "num_input_tokens_seen": 379534960, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4584, "time_per_iteration": 2.6065118312835693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00796186, "balance_loss_mlp": 1.34451175, "diversity_loss_mlp": 0.22621799, "epoch": 0.882070026933436, "flos": 1045394242560.0, "grad_norm": 0.03516245413492739, "language_loss": 0.73908472, "learning_rate": 3.60303747634928e-05, "loss": 0.74704659, "num_input_tokens_seen": 379617456, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0108207, "step": 4585, "time_per_iteration": 3.3458354473114014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048695, "balance_loss_mlp": 1.039891, "diversity_loss_mlp": 0.0, "epoch": 0.8822624086186995, "flos": 474409446912.0, "grad_norm": 0.06564674034294884, "language_loss": 0.80001426, "learning_rate": 3.591434321288345e-05, "loss": 0.81050122, "num_input_tokens_seen": 379687792, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4586, "time_per_iteration": 2.72759747505188 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049471, "balance_loss_mlp": 1.04060817, "diversity_loss_mlp": 0.0, "epoch": 0.882454790303963, "flos": 654023434752.0, "grad_norm": 0.07346558638928435, "language_loss": 0.81996882, "learning_rate": 3.579849183630485e-05, "loss": 0.83046365, "num_input_tokens_seen": 379761120, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4587, "time_per_iteration": 2.808663845062256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051315, "balance_loss_mlp": 1.0421896, "diversity_loss_mlp": 0.0, "epoch": 0.8826471719892266, "flos": 470325242880.0, "grad_norm": 0.06304354104337369, "language_loss": 0.78938949, "learning_rate": 3.568282067873468e-05, "loss": 0.79990268, "num_input_tokens_seen": 379829008, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 4588, "time_per_iteration": 2.573918581008911 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047855, "balance_loss_mlp": 1.03888416, "diversity_loss_mlp": 0.0, "epoch": 0.8828395536744902, "flos": 468753744384.0, "grad_norm": 0.061374871286848334, "language_loss": 0.83903325, "learning_rate": 3.556732978508048e-05, "loss": 0.8495118, "num_input_tokens_seen": 379899584, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4589, "time_per_iteration": 2.6800525188446045 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049011, "balance_loss_mlp": 1.04007053, "diversity_loss_mlp": 0.0, "epoch": 0.8830319353597538, "flos": 721377944064.0, "grad_norm": 0.06744146282588834, "language_loss": 0.81342435, "learning_rate": 3.545201920017971e-05, "loss": 0.82391441, "num_input_tokens_seen": 379979440, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4590, "time_per_iteration": 2.953735589981079 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052142, "balance_loss_mlp": 1.04338574, "diversity_loss_mlp": 0.0, "epoch": 0.8832243170450174, "flos": 443277204480.0, "grad_norm": 0.07827681611400703, "language_loss": 0.81570184, "learning_rate": 3.5336888968799996e-05, "loss": 0.82622325, "num_input_tokens_seen": 380046944, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4591, "time_per_iteration": 2.611823081970215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049471, "balance_loss_mlp": 1.04045248, "diversity_loss_mlp": 0.0, "epoch": 0.8834166987302808, "flos": 566583413760.0, "grad_norm": 0.07488922713809969, "language_loss": 0.82166886, "learning_rate": 3.5221939135638756e-05, "loss": 0.83216357, "num_input_tokens_seen": 380118048, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4592, "time_per_iteration": 2.820740222930908 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049441, "balance_loss_mlp": 1.04061973, "diversity_loss_mlp": 0.0, "epoch": 0.8836090804155444, "flos": 609316328448.0, "grad_norm": 0.06826234415728213, "language_loss": 0.82207388, "learning_rate": 3.510716974532352e-05, "loss": 0.83256829, "num_input_tokens_seen": 380192416, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4593, "time_per_iteration": 2.7582898139953613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048958, "balance_loss_mlp": 1.04009509, "diversity_loss_mlp": 0.0, "epoch": 0.883801462100808, "flos": 557065963008.0, "grad_norm": 0.07322628079560306, "language_loss": 0.80310255, "learning_rate": 3.4992580842411745e-05, "loss": 0.81359208, "num_input_tokens_seen": 380264432, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4594, "time_per_iteration": 2.7062149047851562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051053, "balance_loss_mlp": 1.04161763, "diversity_loss_mlp": 0.0, "epoch": 0.8839938437860716, "flos": 516188671488.0, "grad_norm": 0.08697939284189399, "language_loss": 0.77308345, "learning_rate": 3.487817247139064e-05, "loss": 0.78359401, "num_input_tokens_seen": 380334192, "router_z_loss_mlp": 0.09423828, "routerloss_mlp": 0.0, "step": 4595, "time_per_iteration": 2.6008739471435547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047041, "balance_loss_mlp": 1.03805816, "diversity_loss_mlp": 0.0, "epoch": 0.8841862254713351, "flos": 713696292864.0, "grad_norm": 0.07630739769725799, "language_loss": 0.79033625, "learning_rate": 3.47639446766777e-05, "loss": 0.80080664, "num_input_tokens_seen": 380407504, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4596, "time_per_iteration": 2.8426897525787354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048948, "balance_loss_mlp": 1.040079, "diversity_loss_mlp": 0.0, "epoch": 0.8843786071565987, "flos": 833975875584.0, "grad_norm": 0.06236969459816259, "language_loss": 0.82549202, "learning_rate": 3.4649897502620095e-05, "loss": 0.83598149, "num_input_tokens_seen": 380486272, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4597, "time_per_iteration": 3.0126264095306396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050555, "balance_loss_mlp": 1.0417217, "diversity_loss_mlp": 0.0, "epoch": 0.8845709888418622, "flos": 656884505088.0, "grad_norm": 0.057498871629657215, "language_loss": 0.82855976, "learning_rate": 3.453603099349462e-05, "loss": 0.83906525, "num_input_tokens_seen": 380568480, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4598, "time_per_iteration": 2.9096622467041016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00779413, "balance_loss_mlp": 1.31441939, "diversity_loss_mlp": 0.22293654, "epoch": 0.8847633705271258, "flos": 523326666240.0, "grad_norm": 0.031937649468038294, "language_loss": 0.80943024, "learning_rate": 3.442234519350823e-05, "loss": 0.81722438, "num_input_tokens_seen": 380643088, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01073514, "step": 4599, "time_per_iteration": 2.752638339996338 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049498, "balance_loss_mlp": 1.04064703, "diversity_loss_mlp": 0.0, "epoch": 0.8849557522123894, "flos": 548591035392.0, "grad_norm": 0.06795094778934727, "language_loss": 0.84458822, "learning_rate": 3.430884014679786e-05, "loss": 0.85508323, "num_input_tokens_seen": 380714512, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4600, "time_per_iteration": 2.663498878479004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00779393, "balance_loss_mlp": 1.31195164, "diversity_loss_mlp": 0.22577716, "epoch": 0.8851481338976529, "flos": 622372128768.0, "grad_norm": 0.03181593301262544, "language_loss": 0.83776021, "learning_rate": 3.4195515897429974e-05, "loss": 0.84555423, "num_input_tokens_seen": 380789168, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01052869, "step": 4601, "time_per_iteration": 2.7995564937591553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046945, "balance_loss_mlp": 1.0379926, "diversity_loss_mlp": 0.0, "epoch": 0.8853405155829165, "flos": 444359374848.0, "grad_norm": 0.06356049403382279, "language_loss": 0.80725026, "learning_rate": 3.408237248940088e-05, "loss": 0.8177197, "num_input_tokens_seen": 380856992, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4602, "time_per_iteration": 2.6017932891845703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047376, "balance_loss_mlp": 1.03828001, "diversity_loss_mlp": 0.0, "epoch": 0.8855328972681801, "flos": 730470680064.0, "grad_norm": 0.07035000464547823, "language_loss": 0.77883828, "learning_rate": 3.396940996663683e-05, "loss": 0.78931201, "num_input_tokens_seen": 380930480, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4603, "time_per_iteration": 2.9521942138671875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046951, "balance_loss_mlp": 1.03792644, "diversity_loss_mlp": 0.0, "epoch": 0.8857252789534437, "flos": 487376414208.0, "grad_norm": 0.06898692389267871, "language_loss": 0.78990823, "learning_rate": 3.385662837299375e-05, "loss": 0.80037773, "num_input_tokens_seen": 380994192, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4604, "time_per_iteration": 2.5854694843292236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047713, "balance_loss_mlp": 1.03895068, "diversity_loss_mlp": 0.0, "epoch": 0.8859176606387072, "flos": 508556206080.0, "grad_norm": 0.06638743776056398, "language_loss": 0.81713545, "learning_rate": 3.374402775225727e-05, "loss": 0.82761252, "num_input_tokens_seen": 381066848, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4605, "time_per_iteration": 2.692868232727051 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045834, "balance_loss_mlp": 1.03658962, "diversity_loss_mlp": 0.0, "epoch": 0.8861100423239707, "flos": 516628440576.0, "grad_norm": 0.06624513803881459, "language_loss": 0.85526776, "learning_rate": 3.3631608148142925e-05, "loss": 0.86572611, "num_input_tokens_seen": 381138816, "router_z_loss_mlp": 0.09240723, "routerloss_mlp": 0.0, "step": 4606, "time_per_iteration": 2.6592142581939697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790369, "balance_loss_mlp": 1.33229494, "diversity_loss_mlp": 0.22699621, "epoch": 0.8863024240092343, "flos": 626975396352.0, "grad_norm": 0.03136786172758775, "language_loss": 0.79641789, "learning_rate": 3.3519369604295746e-05, "loss": 0.80432159, "num_input_tokens_seen": 381208448, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01072356, "step": 4607, "time_per_iteration": 2.7557034492492676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048991, "balance_loss_mlp": 1.03997266, "diversity_loss_mlp": 0.0, "epoch": 0.8864948056944979, "flos": 766910260224.0, "grad_norm": 0.053068589539523224, "language_loss": 0.83634484, "learning_rate": 3.340731216429083e-05, "loss": 0.84683472, "num_input_tokens_seen": 381289712, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 4608, "time_per_iteration": 2.970646381378174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01013538, "balance_loss_mlp": 1.00912714, "diversity_loss_mlp": 0.0, "epoch": 0.8866871873797615, "flos": 1502331452928.0, "grad_norm": 0.013952158084226052, "language_loss": 0.78830957, "learning_rate": 3.329543587163253e-05, "loss": 0.79844493, "num_input_tokens_seen": 381520848, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4609, "time_per_iteration": 4.800167798995972 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046127, "balance_loss_mlp": 1.03707361, "diversity_loss_mlp": 0.0, "epoch": 0.886879569065025, "flos": 811516050432.0, "grad_norm": 0.06983974762090492, "language_loss": 0.82014269, "learning_rate": 3.3183740769755e-05, "loss": 0.83060396, "num_input_tokens_seen": 381603008, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4610, "time_per_iteration": 3.0428099632263184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01013271, "balance_loss_mlp": 1.00885999, "diversity_loss_mlp": 0.0, "epoch": 0.8870719507502886, "flos": 1582838309376.0, "grad_norm": 0.013954976330346456, "language_loss": 0.7691083, "learning_rate": 3.307222690202238e-05, "loss": 0.77924109, "num_input_tokens_seen": 381844336, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4611, "time_per_iteration": 4.960276126861572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048945, "balance_loss_mlp": 1.04021323, "diversity_loss_mlp": 0.0, "epoch": 0.8872643324355521, "flos": 634027129344.0, "grad_norm": 0.06747784662244205, "language_loss": 0.75143421, "learning_rate": 3.296089431172811e-05, "loss": 0.76192367, "num_input_tokens_seen": 381918576, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4612, "time_per_iteration": 2.8096370697021484 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046218, "balance_loss_mlp": 1.03731275, "diversity_loss_mlp": 0.0, "epoch": 0.8874567141208157, "flos": 535755119616.0, "grad_norm": 0.081523690910391, "language_loss": 0.83038783, "learning_rate": 3.284974304209532e-05, "loss": 0.84084994, "num_input_tokens_seen": 381987296, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4613, "time_per_iteration": 2.6296303272247314 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047961, "balance_loss_mlp": 1.0389961, "diversity_loss_mlp": 0.0, "epoch": 0.8876490958060793, "flos": 1566302552064.0, "grad_norm": 0.07384350898299535, "language_loss": 0.79394948, "learning_rate": 3.27387731362766e-05, "loss": 0.80442905, "num_input_tokens_seen": 382091744, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4614, "time_per_iteration": 3.9052226543426514 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045945, "balance_loss_mlp": 1.0370816, "diversity_loss_mlp": 0.0, "epoch": 0.8878414774913428, "flos": 636633810432.0, "grad_norm": 0.06075632435028376, "language_loss": 0.84765017, "learning_rate": 3.2627984637354444e-05, "loss": 0.85810959, "num_input_tokens_seen": 382169600, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4615, "time_per_iteration": 2.784306764602661 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049902, "balance_loss_mlp": 1.04100347, "diversity_loss_mlp": 0.0, "epoch": 0.8880338591766064, "flos": 496429502976.0, "grad_norm": 0.07661340087165963, "language_loss": 0.81347793, "learning_rate": 3.251737758834084e-05, "loss": 0.82397699, "num_input_tokens_seen": 382238336, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4616, "time_per_iteration": 2.585916042327881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00779874, "balance_loss_mlp": 1.31519485, "diversity_loss_mlp": 0.22310758, "epoch": 0.88822624086187, "flos": 542861180928.0, "grad_norm": 0.03294259540614503, "language_loss": 0.79988885, "learning_rate": 3.2406952032177086e-05, "loss": 0.80768752, "num_input_tokens_seen": 382308560, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01072259, "step": 4617, "time_per_iteration": 2.658268928527832 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044094, "balance_loss_mlp": 1.03512335, "diversity_loss_mlp": 0.0, "epoch": 0.8884186225471336, "flos": 551822865408.0, "grad_norm": 0.08219678758811591, "language_loss": 0.83779407, "learning_rate": 3.229670801173418e-05, "loss": 0.84823501, "num_input_tokens_seen": 382377504, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4618, "time_per_iteration": 2.6499626636505127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01013119, "balance_loss_mlp": 1.00873196, "diversity_loss_mlp": 0.0, "epoch": 0.888611004232397, "flos": 1565263305216.0, "grad_norm": 0.01269771212796008, "language_loss": 0.78512192, "learning_rate": 3.218664556981288e-05, "loss": 0.79525316, "num_input_tokens_seen": 382615728, "router_z_loss_mlp": 0.04394531, "routerloss_mlp": 0.0, "step": 4619, "time_per_iteration": 5.039214134216309 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048007, "balance_loss_mlp": 1.03929269, "diversity_loss_mlp": 0.0, "epoch": 0.8888033859176606, "flos": 767028828672.0, "grad_norm": 0.06229683334708209, "language_loss": 0.82604653, "learning_rate": 3.207676474914301e-05, "loss": 0.83652663, "num_input_tokens_seen": 382695552, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4620, "time_per_iteration": 2.987938404083252 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044784, "balance_loss_mlp": 1.03616548, "diversity_loss_mlp": 0.0, "epoch": 0.8889957676029242, "flos": 934110849024.0, "grad_norm": 0.0772642935579886, "language_loss": 0.8405602, "learning_rate": 3.1967065592384105e-05, "loss": 0.851008, "num_input_tokens_seen": 382775824, "router_z_loss_mlp": 0.08630371, "routerloss_mlp": 0.0, "step": 4621, "time_per_iteration": 3.1390573978424072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050592, "balance_loss_mlp": 1.04172254, "diversity_loss_mlp": 0.0, "epoch": 0.8891881492881878, "flos": 589611488256.0, "grad_norm": 0.06838136238403997, "language_loss": 0.81778359, "learning_rate": 3.1857548142125104e-05, "loss": 0.82828951, "num_input_tokens_seen": 382854464, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4622, "time_per_iteration": 2.799467086791992 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047387, "balance_loss_mlp": 1.03847051, "diversity_loss_mlp": 0.0, "epoch": 0.8893805309734514, "flos": 540718861824.0, "grad_norm": 0.0659043400927782, "language_loss": 0.82619703, "learning_rate": 3.174821244088466e-05, "loss": 0.83667088, "num_input_tokens_seen": 382925088, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4623, "time_per_iteration": 2.7340970039367676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046712, "balance_loss_mlp": 1.0377115, "diversity_loss_mlp": 0.0, "epoch": 0.8895729126587149, "flos": 560095160832.0, "grad_norm": 0.06558378954602251, "language_loss": 0.81849378, "learning_rate": 3.163905853111054e-05, "loss": 0.8289609, "num_input_tokens_seen": 382998640, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4624, "time_per_iteration": 2.6568024158477783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047965, "balance_loss_mlp": 1.03908443, "diversity_loss_mlp": 0.0, "epoch": 0.8897652943439784, "flos": 610154021376.0, "grad_norm": 0.060975907763050036, "language_loss": 0.81057096, "learning_rate": 3.153008645517996e-05, "loss": 0.82105064, "num_input_tokens_seen": 383076000, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4625, "time_per_iteration": 2.7340495586395264 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044582, "balance_loss_mlp": 1.03537273, "diversity_loss_mlp": 0.0, "epoch": 0.889957676029242, "flos": 917847811584.0, "grad_norm": 0.07485889575749058, "language_loss": 0.770868, "learning_rate": 3.142129625539969e-05, "loss": 0.78131384, "num_input_tokens_seen": 383166640, "router_z_loss_mlp": 0.09210205, "routerloss_mlp": 0.0, "step": 4626, "time_per_iteration": 3.187793016433716 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051033, "balance_loss_mlp": 1.04202616, "diversity_loss_mlp": 0.0, "epoch": 0.8901500577145056, "flos": 488698292736.0, "grad_norm": 0.08455877289506715, "language_loss": 0.8016057, "learning_rate": 3.131268797400588e-05, "loss": 0.81211603, "num_input_tokens_seen": 383232928, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4627, "time_per_iteration": 2.5675413608551025 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104463, "balance_loss_mlp": 1.03559375, "diversity_loss_mlp": 0.0, "epoch": 0.8903424393997691, "flos": 733648181760.0, "grad_norm": 0.06293120132110656, "language_loss": 0.80719471, "learning_rate": 3.120426165316398e-05, "loss": 0.81764102, "num_input_tokens_seen": 383314352, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4628, "time_per_iteration": 2.9961817264556885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044841, "balance_loss_mlp": 1.03616869, "diversity_loss_mlp": 0.0, "epoch": 0.8905348210850327, "flos": 519813282816.0, "grad_norm": 0.08203467156217556, "language_loss": 0.81727576, "learning_rate": 3.109601733496881e-05, "loss": 0.82772422, "num_input_tokens_seen": 383384848, "router_z_loss_mlp": 0.08685303, "routerloss_mlp": 0.0, "step": 4629, "time_per_iteration": 2.679408073425293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042396, "balance_loss_mlp": 1.03355646, "diversity_loss_mlp": 0.0, "epoch": 0.8907272027702963, "flos": 578976989184.0, "grad_norm": 0.06898009343071365, "language_loss": 0.79810011, "learning_rate": 3.098795506144458e-05, "loss": 0.80852401, "num_input_tokens_seen": 383463360, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4630, "time_per_iteration": 2.83233380317688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041898, "balance_loss_mlp": 1.03328514, "diversity_loss_mlp": 0.0, "epoch": 0.8909195844555599, "flos": 893628910080.0, "grad_norm": 0.0715777029832187, "language_loss": 0.7953496, "learning_rate": 3.088007487454475e-05, "loss": 0.80576855, "num_input_tokens_seen": 383542080, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 4631, "time_per_iteration": 3.12410569190979 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0077771, "balance_loss_mlp": 1.31088805, "diversity_loss_mlp": 0.22250512, "epoch": 0.8911119661408234, "flos": 549865926144.0, "grad_norm": 0.032192261312759214, "language_loss": 0.84286821, "learning_rate": 3.077237681615208e-05, "loss": 0.8506453, "num_input_tokens_seen": 383613056, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01101306, "step": 4632, "time_per_iteration": 2.703425884246826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049311, "balance_loss_mlp": 1.04004884, "diversity_loss_mlp": 0.0, "epoch": 0.8913043478260869, "flos": 481139979264.0, "grad_norm": 0.08188608007058847, "language_loss": 0.84165525, "learning_rate": 3.066486092807874e-05, "loss": 0.85214841, "num_input_tokens_seen": 383683280, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 4633, "time_per_iteration": 2.712557554244995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047634, "balance_loss_mlp": 1.03861618, "diversity_loss_mlp": 0.0, "epoch": 0.8914967295113505, "flos": 484581782016.0, "grad_norm": 0.06060123366569166, "language_loss": 0.85206622, "learning_rate": 3.055752725206601e-05, "loss": 0.86254251, "num_input_tokens_seen": 383754624, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4634, "time_per_iteration": 2.630039691925049 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042062, "balance_loss_mlp": 1.03316331, "diversity_loss_mlp": 0.0, "epoch": 0.8916891111966141, "flos": 445664001024.0, "grad_norm": 0.06527746139553993, "language_loss": 0.8135035, "learning_rate": 3.0450375829784714e-05, "loss": 0.82392418, "num_input_tokens_seen": 383821984, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4635, "time_per_iteration": 2.5558903217315674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047554, "balance_loss_mlp": 1.03875005, "diversity_loss_mlp": 0.0, "epoch": 0.8918814928818777, "flos": 564016379904.0, "grad_norm": 0.06346729793174329, "language_loss": 0.78307879, "learning_rate": 3.034340670283453e-05, "loss": 0.79355425, "num_input_tokens_seen": 383890880, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4636, "time_per_iteration": 2.7006030082702637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045904, "balance_loss_mlp": 1.03714168, "diversity_loss_mlp": 0.0, "epoch": 0.8920738745671412, "flos": 575943022080.0, "grad_norm": 0.06783278448064689, "language_loss": 0.8109082, "learning_rate": 3.0236619912744513e-05, "loss": 0.82136714, "num_input_tokens_seen": 383962480, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4637, "time_per_iteration": 2.6627137660980225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043668, "balance_loss_mlp": 1.03518057, "diversity_loss_mlp": 0.0, "epoch": 0.8922662562524047, "flos": 620180623872.0, "grad_norm": 0.06701291241567459, "language_loss": 0.84168345, "learning_rate": 3.0130015500973163e-05, "loss": 0.85212016, "num_input_tokens_seen": 384033616, "router_z_loss_mlp": 0.08496094, "routerloss_mlp": 0.0, "step": 4638, "time_per_iteration": 2.7190563678741455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048979, "balance_loss_mlp": 1.04025865, "diversity_loss_mlp": 0.0, "epoch": 0.8924586379376683, "flos": 583624673280.0, "grad_norm": 0.06480897369874776, "language_loss": 0.79137188, "learning_rate": 3.0023593508907877e-05, "loss": 0.80186164, "num_input_tokens_seen": 384108848, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4639, "time_per_iteration": 2.7548539638519287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046259, "balance_loss_mlp": 1.03746128, "diversity_loss_mlp": 0.0, "epoch": 0.8926510196229319, "flos": 525177520128.0, "grad_norm": 0.06545758779491198, "language_loss": 0.81798422, "learning_rate": 2.991735397786538e-05, "loss": 0.82844687, "num_input_tokens_seen": 384185728, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4640, "time_per_iteration": 2.7450599670410156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046018, "balance_loss_mlp": 1.03710771, "diversity_loss_mlp": 0.0, "epoch": 0.8928434013081955, "flos": 486669772800.0, "grad_norm": 0.07321859189533414, "language_loss": 0.80895549, "learning_rate": 2.981129694909146e-05, "loss": 0.81941569, "num_input_tokens_seen": 384251552, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4641, "time_per_iteration": 2.5623698234558105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003551, "balance_loss_mlp": 0.99911606, "diversity_loss_mlp": 0.0, "epoch": 0.893035782993459, "flos": 1448302560768.0, "grad_norm": 0.005611533508350328, "language_loss": 0.80330861, "learning_rate": 2.970542246376118e-05, "loss": 0.81334412, "num_input_tokens_seen": 384472176, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4642, "time_per_iteration": 4.691712379455566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047132, "balance_loss_mlp": 1.03812027, "diversity_loss_mlp": 0.0, "epoch": 0.8932281646787226, "flos": 611320255488.0, "grad_norm": 0.0756626581840296, "language_loss": 0.8056438, "learning_rate": 2.95997305629786e-05, "loss": 0.8161152, "num_input_tokens_seen": 384544224, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4643, "time_per_iteration": 2.774066925048828 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048848, "balance_loss_mlp": 1.03975809, "diversity_loss_mlp": 0.0, "epoch": 0.8934205463639862, "flos": 565760775168.0, "grad_norm": 0.07062905944842346, "language_loss": 0.84894288, "learning_rate": 2.9494221287776957e-05, "loss": 0.85943139, "num_input_tokens_seen": 384611728, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4644, "time_per_iteration": 2.6488940715789795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048016, "balance_loss_mlp": 1.03935552, "diversity_loss_mlp": 0.0, "epoch": 0.8936129280492497, "flos": 488431420416.0, "grad_norm": 0.0836667751857819, "language_loss": 0.78037202, "learning_rate": 2.9388894679118484e-05, "loss": 0.79085219, "num_input_tokens_seen": 384678048, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 4645, "time_per_iteration": 2.583796977996826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049053, "balance_loss_mlp": 1.04036856, "diversity_loss_mlp": 0.0, "epoch": 0.8938053097345132, "flos": 886490542080.0, "grad_norm": 0.05897365940384636, "language_loss": 0.807109, "learning_rate": 2.9283750777894912e-05, "loss": 0.81759953, "num_input_tokens_seen": 384766768, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 4646, "time_per_iteration": 3.2107162475585938 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045739, "balance_loss_mlp": 1.03690004, "diversity_loss_mlp": 0.0, "epoch": 0.8939976914197768, "flos": 593285658624.0, "grad_norm": 0.06566650575637094, "language_loss": 0.8383972, "learning_rate": 2.9178789624926427e-05, "loss": 0.8488546, "num_input_tokens_seen": 384842352, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4647, "time_per_iteration": 2.742075204849243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050364, "balance_loss_mlp": 1.04111314, "diversity_loss_mlp": 0.0, "epoch": 0.8941900731050404, "flos": 523247745024.0, "grad_norm": 0.07362813813067959, "language_loss": 0.81445944, "learning_rate": 2.9074011260962706e-05, "loss": 0.82496303, "num_input_tokens_seen": 384912048, "router_z_loss_mlp": 0.09259033, "routerloss_mlp": 0.0, "step": 4648, "time_per_iteration": 2.664386510848999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044009, "balance_loss_mlp": 1.03510404, "diversity_loss_mlp": 0.0, "epoch": 0.894382454790304, "flos": 800582745600.0, "grad_norm": 0.06107370863093702, "language_loss": 0.80719924, "learning_rate": 2.8969415726682158e-05, "loss": 0.81763935, "num_input_tokens_seen": 384986560, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4649, "time_per_iteration": 2.9920804500579834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047105, "balance_loss_mlp": 1.03825331, "diversity_loss_mlp": 0.0, "epoch": 0.8945748364755676, "flos": 479037307392.0, "grad_norm": 0.06165388839592064, "language_loss": 0.84649253, "learning_rate": 2.8865003062692517e-05, "loss": 0.85696357, "num_input_tokens_seen": 385057376, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4650, "time_per_iteration": 2.6212713718414307 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046466, "balance_loss_mlp": 1.03758526, "diversity_loss_mlp": 0.0, "epoch": 0.894767218160831, "flos": 508776090624.0, "grad_norm": 0.06579934808698863, "language_loss": 0.83054405, "learning_rate": 2.876077330953042e-05, "loss": 0.84100872, "num_input_tokens_seen": 385130880, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4651, "time_per_iteration": 2.671393394470215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045401, "balance_loss_mlp": 1.03632951, "diversity_loss_mlp": 0.0, "epoch": 0.8949595998460946, "flos": 685857549312.0, "grad_norm": 0.06478595695479929, "language_loss": 0.81956565, "learning_rate": 2.8656726507661378e-05, "loss": 0.83001965, "num_input_tokens_seen": 385205808, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4652, "time_per_iteration": 2.849560499191284 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045732, "balance_loss_mlp": 1.03662467, "diversity_loss_mlp": 0.0, "epoch": 0.8951519815313582, "flos": 799920520704.0, "grad_norm": 0.06805126112229812, "language_loss": 0.7762472, "learning_rate": 2.855286269747981e-05, "loss": 0.78670454, "num_input_tokens_seen": 385283616, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 4653, "time_per_iteration": 2.9823384284973145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010449, "balance_loss_mlp": 1.03572643, "diversity_loss_mlp": 0.0, "epoch": 0.8953443632166218, "flos": 666740782080.0, "grad_norm": 0.06521391394645211, "language_loss": 0.86080307, "learning_rate": 2.8449181919309398e-05, "loss": 0.87125206, "num_input_tokens_seen": 385357488, "router_z_loss_mlp": 0.09179688, "routerloss_mlp": 0.0, "step": 4654, "time_per_iteration": 2.7805397510528564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048835, "balance_loss_mlp": 1.03972173, "diversity_loss_mlp": 0.0, "epoch": 0.8955367449018854, "flos": 644977686528.0, "grad_norm": 0.0849204409565989, "language_loss": 0.83320463, "learning_rate": 2.8345684213402556e-05, "loss": 0.84369302, "num_input_tokens_seen": 385431280, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4655, "time_per_iteration": 2.876401662826538 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00780551, "balance_loss_mlp": 1.31460428, "diversity_loss_mlp": 0.22509943, "epoch": 0.8957291265871489, "flos": 808714077696.0, "grad_norm": 0.034355787829583595, "language_loss": 0.77789617, "learning_rate": 2.8242369619940644e-05, "loss": 0.78570163, "num_input_tokens_seen": 385509840, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0106987, "step": 4656, "time_per_iteration": 3.0823395252227783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104553, "balance_loss_mlp": 1.03641081, "diversity_loss_mlp": 0.0, "epoch": 0.8959215082724125, "flos": 518923832832.0, "grad_norm": 0.0676440423058397, "language_loss": 0.77287573, "learning_rate": 2.813923817903391e-05, "loss": 0.78333104, "num_input_tokens_seen": 385580384, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4657, "time_per_iteration": 2.64528751373291 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048024, "balance_loss_mlp": 1.03879762, "diversity_loss_mlp": 0.0, "epoch": 0.896113889957676, "flos": 476917383168.0, "grad_norm": 0.0693704945431175, "language_loss": 0.77242142, "learning_rate": 2.8036289930721603e-05, "loss": 0.78290164, "num_input_tokens_seen": 385649184, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4658, "time_per_iteration": 2.6108851432800293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047944, "balance_loss_mlp": 1.03874731, "diversity_loss_mlp": 0.0, "epoch": 0.8963062716429396, "flos": 518162863104.0, "grad_norm": 0.0647769416450041, "language_loss": 0.83169466, "learning_rate": 2.7933524914971697e-05, "loss": 0.84217411, "num_input_tokens_seen": 385717072, "router_z_loss_mlp": 0.09191895, "routerloss_mlp": 0.0, "step": 4659, "time_per_iteration": 2.605060338973999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791335, "balance_loss_mlp": 1.33468997, "diversity_loss_mlp": 0.22667646, "epoch": 0.8964986533282031, "flos": 508484625408.0, "grad_norm": 0.035487365759697125, "language_loss": 0.82103157, "learning_rate": 2.7830943171681113e-05, "loss": 0.82894492, "num_input_tokens_seen": 385788880, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01065169, "step": 4660, "time_per_iteration": 2.7054848670959473 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045343, "balance_loss_mlp": 1.0363133, "diversity_loss_mlp": 0.0, "epoch": 0.8966910350134667, "flos": 536076320256.0, "grad_norm": 0.08335321412533339, "language_loss": 0.81629348, "learning_rate": 2.77285447406756e-05, "loss": 0.82674694, "num_input_tokens_seen": 385854240, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4661, "time_per_iteration": 2.6878600120544434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051819, "balance_loss_mlp": 1.04299188, "diversity_loss_mlp": 0.0, "epoch": 0.8968834166987303, "flos": 723226226688.0, "grad_norm": 0.08023362288618259, "language_loss": 0.84117174, "learning_rate": 2.7626329661709914e-05, "loss": 0.85168993, "num_input_tokens_seen": 385926080, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4662, "time_per_iteration": 2.9065072536468506 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783028, "balance_loss_mlp": 1.31757593, "diversity_loss_mlp": 0.22707665, "epoch": 0.8970757983839939, "flos": 681686710272.0, "grad_norm": 0.028939334122514253, "language_loss": 0.84291148, "learning_rate": 2.7524297974467372e-05, "loss": 0.85074168, "num_input_tokens_seen": 386005696, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01070135, "step": 4663, "time_per_iteration": 2.978598117828369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044806, "balance_loss_mlp": 1.03588283, "diversity_loss_mlp": 0.0, "epoch": 0.8972681800692575, "flos": 613037486592.0, "grad_norm": 0.09868574536780622, "language_loss": 0.75424099, "learning_rate": 2.742244971856006e-05, "loss": 0.76468909, "num_input_tokens_seen": 386073248, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4664, "time_per_iteration": 2.7175958156585693 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104879, "balance_loss_mlp": 1.03972983, "diversity_loss_mlp": 0.0, "epoch": 0.8974605617545209, "flos": 572350344192.0, "grad_norm": 0.07019842465420709, "language_loss": 0.83128035, "learning_rate": 2.732078493352913e-05, "loss": 0.84176832, "num_input_tokens_seen": 386148528, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4665, "time_per_iteration": 2.7153587341308594 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104645, "balance_loss_mlp": 1.03744328, "diversity_loss_mlp": 0.0, "epoch": 0.8976529434397845, "flos": 520418608128.0, "grad_norm": 0.06031238876791543, "language_loss": 0.87254226, "learning_rate": 2.721930365884434e-05, "loss": 0.88300675, "num_input_tokens_seen": 386218528, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4666, "time_per_iteration": 2.6804378032684326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047772, "balance_loss_mlp": 1.03897464, "diversity_loss_mlp": 0.0, "epoch": 0.8978453251250481, "flos": 471355656192.0, "grad_norm": 0.05793843844833838, "language_loss": 0.82573009, "learning_rate": 2.7118005933904176e-05, "loss": 0.83620781, "num_input_tokens_seen": 386284704, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4667, "time_per_iteration": 2.6166820526123047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047529, "balance_loss_mlp": 1.03860664, "diversity_loss_mlp": 0.0, "epoch": 0.8980377068103117, "flos": 591659831808.0, "grad_norm": 0.057031250426829085, "language_loss": 0.82203746, "learning_rate": 2.7016891798035904e-05, "loss": 0.8325128, "num_input_tokens_seen": 386356128, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4668, "time_per_iteration": 2.7726669311523438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105099, "balance_loss_mlp": 1.04209065, "diversity_loss_mlp": 0.0, "epoch": 0.8982300884955752, "flos": 767619472896.0, "grad_norm": 0.07157029094935193, "language_loss": 0.82771599, "learning_rate": 2.691596129049556e-05, "loss": 0.83822584, "num_input_tokens_seen": 386434048, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4669, "time_per_iteration": 2.934701681137085 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050412, "balance_loss_mlp": 1.04158425, "diversity_loss_mlp": 0.0, "epoch": 0.8984224701808388, "flos": 844575496704.0, "grad_norm": 0.07594625881413491, "language_loss": 0.77720773, "learning_rate": 2.681521445046775e-05, "loss": 0.78771186, "num_input_tokens_seen": 386532384, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4670, "time_per_iteration": 3.232701539993286 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050559, "balance_loss_mlp": 1.04171383, "diversity_loss_mlp": 0.0, "epoch": 0.8986148518661023, "flos": 757661879808.0, "grad_norm": 0.07298208517048191, "language_loss": 0.75987267, "learning_rate": 2.6714651317065963e-05, "loss": 0.77037835, "num_input_tokens_seen": 386627120, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4671, "time_per_iteration": 3.183443784713745 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043023, "balance_loss_mlp": 1.03399336, "diversity_loss_mlp": 0.0, "epoch": 0.8988072335513659, "flos": 563070030336.0, "grad_norm": 0.0671693421720064, "language_loss": 0.76635265, "learning_rate": 2.6614271929332133e-05, "loss": 0.77678287, "num_input_tokens_seen": 386700192, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4672, "time_per_iteration": 2.657771587371826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047693, "balance_loss_mlp": 1.03888941, "diversity_loss_mlp": 0.0, "epoch": 0.8989996152366295, "flos": 492683751936.0, "grad_norm": 0.07004510948289375, "language_loss": 0.86707628, "learning_rate": 2.6514076326237147e-05, "loss": 0.87755322, "num_input_tokens_seen": 386764256, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4673, "time_per_iteration": 2.535236120223999 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047542, "balance_loss_mlp": 1.03844619, "diversity_loss_mlp": 0.0, "epoch": 0.899191996921893, "flos": 542567144448.0, "grad_norm": 0.07892824616979383, "language_loss": 0.75965667, "learning_rate": 2.6414064546680438e-05, "loss": 0.77013206, "num_input_tokens_seen": 386835792, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4674, "time_per_iteration": 2.6591787338256836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048875, "balance_loss_mlp": 1.03979182, "diversity_loss_mlp": 0.0, "epoch": 0.8993843786071566, "flos": 471325920768.0, "grad_norm": 0.06983733159730086, "language_loss": 0.80178928, "learning_rate": 2.631423662948984e-05, "loss": 0.81227803, "num_input_tokens_seen": 386904368, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 4675, "time_per_iteration": 2.5485310554504395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048014, "balance_loss_mlp": 1.03897238, "diversity_loss_mlp": 0.0, "epoch": 0.8995767602924202, "flos": 526726623744.0, "grad_norm": 0.07663293464144452, "language_loss": 0.82886845, "learning_rate": 2.621459261342196e-05, "loss": 0.83934855, "num_input_tokens_seen": 386977872, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4676, "time_per_iteration": 2.712852954864502 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047902, "balance_loss_mlp": 1.03895569, "diversity_loss_mlp": 0.0, "epoch": 0.8997691419776838, "flos": 557634212352.0, "grad_norm": 0.063501986784752, "language_loss": 0.8503803, "learning_rate": 2.6115132537162245e-05, "loss": 0.86085933, "num_input_tokens_seen": 387052080, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4677, "time_per_iteration": 2.700191020965576 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049601, "balance_loss_mlp": 1.04097605, "diversity_loss_mlp": 0.0, "epoch": 0.8999615236629472, "flos": 639027947520.0, "grad_norm": 0.06651601339856017, "language_loss": 0.80581087, "learning_rate": 2.601585643932436e-05, "loss": 0.81630689, "num_input_tokens_seen": 387129712, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 4678, "time_per_iteration": 2.815133571624756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004703, "balance_loss_mlp": 1.00024414, "diversity_loss_mlp": 0.0, "epoch": 0.9001539053482108, "flos": 1431510547968.0, "grad_norm": 0.0032341066943480366, "language_loss": 0.85784018, "learning_rate": 2.5916764358450862e-05, "loss": 0.86788726, "num_input_tokens_seen": 387356560, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4679, "time_per_iteration": 4.805148124694824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051809, "balance_loss_mlp": 1.0428679, "diversity_loss_mlp": 0.0, "epoch": 0.9003462870334744, "flos": 566877450240.0, "grad_norm": 0.07566932247626351, "language_loss": 0.79916567, "learning_rate": 2.5817856333012425e-05, "loss": 0.8096838, "num_input_tokens_seen": 387438640, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4680, "time_per_iteration": 2.844775915145874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046896, "balance_loss_mlp": 1.03798509, "diversity_loss_mlp": 0.0, "epoch": 0.900538668718738, "flos": 538655837184.0, "grad_norm": 0.06791957432772232, "language_loss": 0.78502154, "learning_rate": 2.5719132401408883e-05, "loss": 0.7954905, "num_input_tokens_seen": 387507088, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4681, "time_per_iteration": 2.6303482055664062 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045024, "balance_loss_mlp": 1.03633404, "diversity_loss_mlp": 0.0, "epoch": 0.9007310504040016, "flos": 488387003904.0, "grad_norm": 0.08260546999078933, "language_loss": 0.86167276, "learning_rate": 2.5620592601968028e-05, "loss": 0.872123, "num_input_tokens_seen": 387574160, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 4682, "time_per_iteration": 2.5562498569488525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104895, "balance_loss_mlp": 1.04019439, "diversity_loss_mlp": 0.0, "epoch": 0.9009234320892651, "flos": 652901617152.0, "grad_norm": 0.07052497776440367, "language_loss": 0.78726637, "learning_rate": 2.5522236972946532e-05, "loss": 0.79775584, "num_input_tokens_seen": 387652528, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4683, "time_per_iteration": 2.8474693298339844 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045566, "balance_loss_mlp": 1.03673279, "diversity_loss_mlp": 0.0, "epoch": 0.9011158137745287, "flos": 545569178112.0, "grad_norm": 0.058284794620577896, "language_loss": 0.84781289, "learning_rate": 2.5424065552529295e-05, "loss": 0.85826856, "num_input_tokens_seen": 387723520, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4684, "time_per_iteration": 2.6325201988220215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045889, "balance_loss_mlp": 1.03712106, "diversity_loss_mlp": 0.0, "epoch": 0.9013081954597922, "flos": 559699808256.0, "grad_norm": 0.07314098955075891, "language_loss": 0.82745099, "learning_rate": 2.532607837883011e-05, "loss": 0.83790988, "num_input_tokens_seen": 387793664, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4685, "time_per_iteration": 2.7466516494750977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104569, "balance_loss_mlp": 1.03673732, "diversity_loss_mlp": 0.0, "epoch": 0.9015005771450558, "flos": 728652132864.0, "grad_norm": 0.06299423790772288, "language_loss": 0.81375784, "learning_rate": 2.5228275489890706e-05, "loss": 0.82421476, "num_input_tokens_seen": 387871008, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4686, "time_per_iteration": 2.8934953212738037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048153, "balance_loss_mlp": 1.03924799, "diversity_loss_mlp": 0.0, "epoch": 0.9016929588303193, "flos": 517416574464.0, "grad_norm": 0.07273312761869775, "language_loss": 0.81357133, "learning_rate": 2.5130656923681605e-05, "loss": 0.82405281, "num_input_tokens_seen": 387950832, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4687, "time_per_iteration": 2.7839083671569824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046931, "balance_loss_mlp": 1.03816867, "diversity_loss_mlp": 0.0, "epoch": 0.9018853405155829, "flos": 622335052800.0, "grad_norm": 0.05747241213566195, "language_loss": 0.86223972, "learning_rate": 2.503322271810171e-05, "loss": 0.87270904, "num_input_tokens_seen": 388029792, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4688, "time_per_iteration": 2.8053431510925293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048787, "balance_loss_mlp": 1.03985882, "diversity_loss_mlp": 0.0, "epoch": 0.9020777222008465, "flos": 523284820992.0, "grad_norm": 0.06831532416346216, "language_loss": 0.77670169, "learning_rate": 2.4935972910978378e-05, "loss": 0.78718954, "num_input_tokens_seen": 388095872, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4689, "time_per_iteration": 2.6122989654541016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045572, "balance_loss_mlp": 1.03666687, "diversity_loss_mlp": 0.0, "epoch": 0.9022701038861101, "flos": 633713269248.0, "grad_norm": 0.05580417916624313, "language_loss": 0.81750822, "learning_rate": 2.4838907540067346e-05, "loss": 0.82796389, "num_input_tokens_seen": 388171632, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4690, "time_per_iteration": 2.8226675987243652 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049641, "balance_loss_mlp": 1.04086757, "diversity_loss_mlp": 0.0, "epoch": 0.9024624855713737, "flos": 513295294464.0, "grad_norm": 0.07066245461166361, "language_loss": 0.84397352, "learning_rate": 2.474202664305253e-05, "loss": 0.8544699, "num_input_tokens_seen": 388242240, "router_z_loss_mlp": 0.08789062, "routerloss_mlp": 0.0, "step": 4691, "time_per_iteration": 2.608060359954834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046874, "balance_loss_mlp": 1.03758168, "diversity_loss_mlp": 0.0, "epoch": 0.9026548672566371, "flos": 477411480576.0, "grad_norm": 0.06466025971704324, "language_loss": 0.86426198, "learning_rate": 2.464533025754673e-05, "loss": 0.87473077, "num_input_tokens_seen": 388310960, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 4692, "time_per_iteration": 2.63151216506958 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047537, "balance_loss_mlp": 1.0386498, "diversity_loss_mlp": 0.0, "epoch": 0.9028472489419007, "flos": 661994353152.0, "grad_norm": 0.06521986088761798, "language_loss": 0.73844278, "learning_rate": 2.454881842109058e-05, "loss": 0.74891818, "num_input_tokens_seen": 388387280, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4693, "time_per_iteration": 2.833467483520508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048441, "balance_loss_mlp": 1.0395714, "diversity_loss_mlp": 0.0, "epoch": 0.9030396306271643, "flos": 534588885504.0, "grad_norm": 0.07181614420601379, "language_loss": 0.82029641, "learning_rate": 2.4452491171153445e-05, "loss": 0.8307808, "num_input_tokens_seen": 388456992, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4694, "time_per_iteration": 2.6215834617614746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050504, "balance_loss_mlp": 1.04152727, "diversity_loss_mlp": 0.0, "epoch": 0.9032320123124279, "flos": 801032426496.0, "grad_norm": 0.07933043955400586, "language_loss": 0.8251496, "learning_rate": 2.43563485451328e-05, "loss": 0.83565462, "num_input_tokens_seen": 388534896, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4695, "time_per_iteration": 2.9662675857543945 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045657, "balance_loss_mlp": 1.03683591, "diversity_loss_mlp": 0.0, "epoch": 0.9034243939976914, "flos": 553942789632.0, "grad_norm": 0.08647194091584645, "language_loss": 0.76889336, "learning_rate": 2.426039058035451e-05, "loss": 0.77934992, "num_input_tokens_seen": 388606640, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4696, "time_per_iteration": 2.6831114292144775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046387, "balance_loss_mlp": 1.03765512, "diversity_loss_mlp": 0.0, "epoch": 0.903616775682955, "flos": 503903752704.0, "grad_norm": 0.06589427726191109, "language_loss": 0.82852316, "learning_rate": 2.4164617314072823e-05, "loss": 0.83898699, "num_input_tokens_seen": 388675920, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 4697, "time_per_iteration": 2.606084108352661 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046118, "balance_loss_mlp": 1.03745151, "diversity_loss_mlp": 0.0, "epoch": 0.9038091573682185, "flos": 436297052160.0, "grad_norm": 0.07072654359751072, "language_loss": 0.79079431, "learning_rate": 2.406902878347017e-05, "loss": 0.80125546, "num_input_tokens_seen": 388743968, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 4698, "time_per_iteration": 2.6087543964385986 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049163, "balance_loss_mlp": 1.03998375, "diversity_loss_mlp": 0.0, "epoch": 0.9040015390534821, "flos": 532916070912.0, "grad_norm": 0.08844604656187115, "language_loss": 0.81696689, "learning_rate": 2.3973625025657253e-05, "loss": 0.8274585, "num_input_tokens_seen": 388810784, "router_z_loss_mlp": 0.09173584, "routerloss_mlp": 0.0, "step": 4699, "time_per_iteration": 2.6180419921875 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044344, "balance_loss_mlp": 1.03545141, "diversity_loss_mlp": 0.0, "epoch": 0.9041939207387457, "flos": 564307845120.0, "grad_norm": 0.06789594949929362, "language_loss": 0.80433279, "learning_rate": 2.3878406077673275e-05, "loss": 0.81477618, "num_input_tokens_seen": 388885072, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4700, "time_per_iteration": 2.8047759532928467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046422, "balance_loss_mlp": 1.03756499, "diversity_loss_mlp": 0.0, "epoch": 0.9043863024240092, "flos": 515509194240.0, "grad_norm": 0.07594330446268198, "language_loss": 0.77877766, "learning_rate": 2.3783371976485447e-05, "loss": 0.78924191, "num_input_tokens_seen": 388951184, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4701, "time_per_iteration": 2.5752878189086914 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003251, "balance_loss_mlp": 0.99879241, "diversity_loss_mlp": 0.0, "epoch": 0.9045786841092728, "flos": 1277949063168.0, "grad_norm": 0.003648556595750329, "language_loss": 0.72929788, "learning_rate": 2.368852275898914e-05, "loss": 0.73933041, "num_input_tokens_seen": 389170752, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4702, "time_per_iteration": 4.9735963344573975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050029, "balance_loss_mlp": 1.04117787, "diversity_loss_mlp": 0.0, "epoch": 0.9047710657945364, "flos": 585841144320.0, "grad_norm": 0.08131986828145982, "language_loss": 0.8338269, "learning_rate": 2.3593858462008178e-05, "loss": 0.84432721, "num_input_tokens_seen": 389239600, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4703, "time_per_iteration": 2.736764430999756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045539, "balance_loss_mlp": 1.0364728, "diversity_loss_mlp": 0.0, "epoch": 0.9049634474798, "flos": 571937739264.0, "grad_norm": 0.085064980666539, "language_loss": 0.79620826, "learning_rate": 2.3499379122294495e-05, "loss": 0.80666363, "num_input_tokens_seen": 389316032, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 4704, "time_per_iteration": 2.7620725631713867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010488, "balance_loss_mlp": 1.04021692, "diversity_loss_mlp": 0.0, "epoch": 0.9051558291650635, "flos": 572619787776.0, "grad_norm": 0.08171845507100765, "language_loss": 0.74530506, "learning_rate": 2.3405084776528307e-05, "loss": 0.75579304, "num_input_tokens_seen": 389383504, "router_z_loss_mlp": 0.0859375, "routerloss_mlp": 0.0, "step": 4705, "time_per_iteration": 2.6691336631774902 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048986, "balance_loss_mlp": 1.0402658, "diversity_loss_mlp": 0.0, "epoch": 0.905348210850327, "flos": 540538624512.0, "grad_norm": 0.08031830917867225, "language_loss": 0.79134667, "learning_rate": 2.331097546131783e-05, "loss": 0.80183655, "num_input_tokens_seen": 389454592, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4706, "time_per_iteration": 2.657421350479126 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049352, "balance_loss_mlp": 1.04074478, "diversity_loss_mlp": 0.0, "epoch": 0.9055405925355906, "flos": 516381391872.0, "grad_norm": 0.07852771434357471, "language_loss": 0.81530303, "learning_rate": 2.321705121319956e-05, "loss": 0.82579654, "num_input_tokens_seen": 389519696, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 4707, "time_per_iteration": 2.6042165756225586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045171, "balance_loss_mlp": 1.03612292, "diversity_loss_mlp": 0.0, "epoch": 0.9057329742208542, "flos": 914643145728.0, "grad_norm": 0.052073742250211955, "language_loss": 0.85184813, "learning_rate": 2.3123312068638104e-05, "loss": 0.86229986, "num_input_tokens_seen": 389603568, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4708, "time_per_iteration": 3.205712080001831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048052, "balance_loss_mlp": 1.03921902, "diversity_loss_mlp": 0.0, "epoch": 0.9059253559061178, "flos": 905261515776.0, "grad_norm": 0.07208392805658173, "language_loss": 0.83473063, "learning_rate": 2.3029758064026295e-05, "loss": 0.84521115, "num_input_tokens_seen": 389687504, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4709, "time_per_iteration": 3.15082049369812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046721, "balance_loss_mlp": 1.03755391, "diversity_loss_mlp": 0.0, "epoch": 0.9061177375913813, "flos": 664534222848.0, "grad_norm": 0.09897458123618827, "language_loss": 0.77498788, "learning_rate": 2.2936389235684918e-05, "loss": 0.78545511, "num_input_tokens_seen": 389764880, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4710, "time_per_iteration": 2.856567144393921 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047626, "balance_loss_mlp": 1.03875113, "diversity_loss_mlp": 0.0, "epoch": 0.9063101192766448, "flos": 565609900032.0, "grad_norm": 0.06579655928741501, "language_loss": 0.82653207, "learning_rate": 2.2843205619862972e-05, "loss": 0.83700836, "num_input_tokens_seen": 389838304, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4711, "time_per_iteration": 2.8177871704101562 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044628, "balance_loss_mlp": 1.03596139, "diversity_loss_mlp": 0.0, "epoch": 0.9065025009619084, "flos": 727377242112.0, "grad_norm": 0.06583523405134029, "language_loss": 0.78812146, "learning_rate": 2.2750207252737742e-05, "loss": 0.79856777, "num_input_tokens_seen": 389908592, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 4712, "time_per_iteration": 2.880993604660034 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010452, "balance_loss_mlp": 1.03643262, "diversity_loss_mlp": 0.0, "epoch": 0.906694882647172, "flos": 531512699904.0, "grad_norm": 0.07415444506941751, "language_loss": 0.80136561, "learning_rate": 2.265739417041418e-05, "loss": 0.81181759, "num_input_tokens_seen": 389979040, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4713, "time_per_iteration": 2.627692937850952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046054, "balance_loss_mlp": 1.03697634, "diversity_loss_mlp": 0.0, "epoch": 0.9068872643324356, "flos": 429788975616.0, "grad_norm": 0.06943776230353088, "language_loss": 0.84932685, "learning_rate": 2.2564766408925574e-05, "loss": 0.85978746, "num_input_tokens_seen": 390046080, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4714, "time_per_iteration": 2.5953822135925293 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049292, "balance_loss_mlp": 1.04006529, "diversity_loss_mlp": 0.0, "epoch": 0.9070796460176991, "flos": 588366332928.0, "grad_norm": 0.07092231807138824, "language_loss": 0.79715693, "learning_rate": 2.2472324004233214e-05, "loss": 0.80764985, "num_input_tokens_seen": 390122176, "router_z_loss_mlp": 0.09228516, "routerloss_mlp": 0.0, "step": 4715, "time_per_iteration": 2.7853944301605225 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047488, "balance_loss_mlp": 1.03861296, "diversity_loss_mlp": 0.0, "epoch": 0.9072720277029627, "flos": 571582033920.0, "grad_norm": 0.08464437568581946, "language_loss": 0.7548542, "learning_rate": 2.2380066992226446e-05, "loss": 0.765329, "num_input_tokens_seen": 390195216, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4716, "time_per_iteration": 2.7233853340148926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046416, "balance_loss_mlp": 1.03780317, "diversity_loss_mlp": 0.0, "epoch": 0.9074644093882263, "flos": 555798412800.0, "grad_norm": 0.07842659824105606, "language_loss": 0.88551593, "learning_rate": 2.2287995408722617e-05, "loss": 0.89598, "num_input_tokens_seen": 390263216, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 4717, "time_per_iteration": 2.66381573677063 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047464, "balance_loss_mlp": 1.03856564, "diversity_loss_mlp": 0.0, "epoch": 0.9076567910734898, "flos": 640994798592.0, "grad_norm": 0.06367124229028898, "language_loss": 0.82281721, "learning_rate": 2.2196109289467083e-05, "loss": 0.83329189, "num_input_tokens_seen": 390337360, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 4718, "time_per_iteration": 2.7830944061279297 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047901, "balance_loss_mlp": 1.03901446, "diversity_loss_mlp": 0.0, "epoch": 0.9078491727587533, "flos": 733998744576.0, "grad_norm": 0.07734106151470267, "language_loss": 0.81955713, "learning_rate": 2.2104408670133193e-05, "loss": 0.83003616, "num_input_tokens_seen": 390427728, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4719, "time_per_iteration": 3.1287927627563477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046133, "balance_loss_mlp": 1.03729379, "diversity_loss_mlp": 0.0, "epoch": 0.9080415544440169, "flos": 654774492672.0, "grad_norm": 0.060901042499375765, "language_loss": 0.86802292, "learning_rate": 2.2012893586322245e-05, "loss": 0.87848425, "num_input_tokens_seen": 390504736, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4720, "time_per_iteration": 2.8568358421325684 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041752, "balance_loss_mlp": 1.03286505, "diversity_loss_mlp": 0.0, "epoch": 0.9082339361292805, "flos": 597463838208.0, "grad_norm": 0.06480953268672687, "language_loss": 0.79562217, "learning_rate": 2.1921564073563604e-05, "loss": 0.80603969, "num_input_tokens_seen": 390582048, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 4721, "time_per_iteration": 2.7394514083862305 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104847, "balance_loss_mlp": 1.03955877, "diversity_loss_mlp": 0.0, "epoch": 0.9084263178145441, "flos": 504407761920.0, "grad_norm": 0.09226866260525313, "language_loss": 0.84760112, "learning_rate": 2.183042016731457e-05, "loss": 0.85808581, "num_input_tokens_seen": 390652976, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4722, "time_per_iteration": 2.616605281829834 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046083, "balance_loss_mlp": 1.03716016, "diversity_loss_mlp": 0.0, "epoch": 0.9086186994998077, "flos": 550031482368.0, "grad_norm": 0.07637156979590433, "language_loss": 0.80386579, "learning_rate": 2.1739461902960223e-05, "loss": 0.81432664, "num_input_tokens_seen": 390726832, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4723, "time_per_iteration": 2.740421772003174 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045863, "balance_loss_mlp": 1.03707719, "diversity_loss_mlp": 0.0, "epoch": 0.9088110811850711, "flos": 1134076847616.0, "grad_norm": 0.06514717136506207, "language_loss": 0.75284863, "learning_rate": 2.1648689315813763e-05, "loss": 0.76330721, "num_input_tokens_seen": 390824480, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4724, "time_per_iteration": 3.563429117202759 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052369, "balance_loss_mlp": 1.0434463, "diversity_loss_mlp": 0.0, "epoch": 0.9090034628703347, "flos": 556991811072.0, "grad_norm": 0.06971007170583818, "language_loss": 0.76744211, "learning_rate": 2.155810244111628e-05, "loss": 0.77796578, "num_input_tokens_seen": 390897552, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4725, "time_per_iteration": 2.658780336380005 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052144, "balance_loss_mlp": 1.0433048, "diversity_loss_mlp": 0.0, "epoch": 0.9091958445555983, "flos": 543970515456.0, "grad_norm": 0.06413099042531242, "language_loss": 0.84407449, "learning_rate": 2.146770131403658e-05, "loss": 0.8545959, "num_input_tokens_seen": 390969008, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4726, "time_per_iteration": 2.6778671741485596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049216, "balance_loss_mlp": 1.04029298, "diversity_loss_mlp": 0.0, "epoch": 0.9093882262408619, "flos": 526113957888.0, "grad_norm": 0.07280363304099743, "language_loss": 0.81181479, "learning_rate": 2.1377485969671594e-05, "loss": 0.82230693, "num_input_tokens_seen": 391038880, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4727, "time_per_iteration": 2.6568636894226074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051071, "balance_loss_mlp": 1.04238701, "diversity_loss_mlp": 0.0, "epoch": 0.9095806079261254, "flos": 548526795264.0, "grad_norm": 0.0725280737417026, "language_loss": 0.81922674, "learning_rate": 2.1287456443046084e-05, "loss": 0.82973742, "num_input_tokens_seen": 391106720, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 4728, "time_per_iteration": 2.643022060394287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044931, "balance_loss_mlp": 1.03610396, "diversity_loss_mlp": 0.0, "epoch": 0.909772989611389, "flos": 572535724032.0, "grad_norm": 0.0673800156354799, "language_loss": 0.84635472, "learning_rate": 2.1197612769112528e-05, "loss": 0.85680401, "num_input_tokens_seen": 391178128, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4729, "time_per_iteration": 2.724855661392212 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048461, "balance_loss_mlp": 1.03952658, "diversity_loss_mlp": 0.0, "epoch": 0.9099653712966526, "flos": 561812391936.0, "grad_norm": 0.07330494114530435, "language_loss": 0.79589331, "learning_rate": 2.1107954982751254e-05, "loss": 0.80637789, "num_input_tokens_seen": 391248848, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4730, "time_per_iteration": 2.665303945541382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047478, "balance_loss_mlp": 1.03856754, "diversity_loss_mlp": 0.0, "epoch": 0.9101577529819161, "flos": 1093800112128.0, "grad_norm": 0.078385767023693, "language_loss": 0.80267072, "learning_rate": 2.101848311877069e-05, "loss": 0.81314552, "num_input_tokens_seen": 391328000, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4731, "time_per_iteration": 3.366713762283325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046528, "balance_loss_mlp": 1.03736663, "diversity_loss_mlp": 0.0, "epoch": 0.9103501346671797, "flos": 445444116480.0, "grad_norm": 0.08027492001685438, "language_loss": 0.81851661, "learning_rate": 2.092919721190678e-05, "loss": 0.82898188, "num_input_tokens_seen": 391391616, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 4732, "time_per_iteration": 2.511289119720459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052991, "balance_loss_mlp": 1.04403806, "diversity_loss_mlp": 0.0, "epoch": 0.9105425163524432, "flos": 500770667520.0, "grad_norm": 0.07912673976757961, "language_loss": 0.77801937, "learning_rate": 2.0840097296823346e-05, "loss": 0.7885493, "num_input_tokens_seen": 391461312, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4733, "time_per_iteration": 2.6270110607147217 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048563, "balance_loss_mlp": 1.03949749, "diversity_loss_mlp": 0.0, "epoch": 0.9107348980377068, "flos": 657519565824.0, "grad_norm": 0.055649375090756015, "language_loss": 0.84341621, "learning_rate": 2.0751183408112162e-05, "loss": 0.85390186, "num_input_tokens_seen": 391542192, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 4734, "time_per_iteration": 2.8428561687469482 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048531, "balance_loss_mlp": 1.0395968, "diversity_loss_mlp": 0.0, "epoch": 0.9109272797229704, "flos": 553668576768.0, "grad_norm": 0.07562354165732797, "language_loss": 0.84999311, "learning_rate": 2.066245558029256e-05, "loss": 0.8604784, "num_input_tokens_seen": 391609968, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4735, "time_per_iteration": 2.617300033569336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047339, "balance_loss_mlp": 1.03857076, "diversity_loss_mlp": 0.0, "epoch": 0.911119661408234, "flos": 519007896576.0, "grad_norm": 0.06845754764753385, "language_loss": 0.84216273, "learning_rate": 2.057391384781182e-05, "loss": 0.8526361, "num_input_tokens_seen": 391681264, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4736, "time_per_iteration": 2.621656894683838 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053341, "balance_loss_mlp": 1.04450214, "diversity_loss_mlp": 0.0, "epoch": 0.9113120430934974, "flos": 554375218176.0, "grad_norm": 0.07185753448877732, "language_loss": 0.83150327, "learning_rate": 2.0485558245044834e-05, "loss": 0.8420366, "num_input_tokens_seen": 391751392, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4737, "time_per_iteration": 2.6248881816864014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052271, "balance_loss_mlp": 1.04334199, "diversity_loss_mlp": 0.0, "epoch": 0.911504424778761, "flos": 501889913856.0, "grad_norm": 0.06362345813560902, "language_loss": 0.81097478, "learning_rate": 2.0397388806294216e-05, "loss": 0.8214975, "num_input_tokens_seen": 391823952, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4738, "time_per_iteration": 2.6537606716156006 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050862, "balance_loss_mlp": 1.0419693, "diversity_loss_mlp": 0.0, "epoch": 0.9116968064640246, "flos": 611100370944.0, "grad_norm": 0.06023003948048014, "language_loss": 0.81882358, "learning_rate": 2.0309405565790527e-05, "loss": 0.82933223, "num_input_tokens_seen": 391895264, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4739, "time_per_iteration": 2.7091641426086426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047604, "balance_loss_mlp": 1.03856826, "diversity_loss_mlp": 0.0, "epoch": 0.9118891881492882, "flos": 572918593536.0, "grad_norm": 0.06392422998543029, "language_loss": 0.82626665, "learning_rate": 2.0221608557691895e-05, "loss": 0.8367427, "num_input_tokens_seen": 391973040, "router_z_loss_mlp": 0.0904541, "routerloss_mlp": 0.0, "step": 4740, "time_per_iteration": 2.762544631958008 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049849, "balance_loss_mlp": 1.04099774, "diversity_loss_mlp": 0.0, "epoch": 0.9120815698345518, "flos": 635961673728.0, "grad_norm": 0.0822598036225358, "language_loss": 0.78046763, "learning_rate": 2.0133997816083992e-05, "loss": 0.79096615, "num_input_tokens_seen": 392048160, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4741, "time_per_iteration": 2.84562087059021 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050163, "balance_loss_mlp": 1.04134798, "diversity_loss_mlp": 0.0, "epoch": 0.9122739515198153, "flos": 702300824064.0, "grad_norm": 0.06551662933562434, "language_loss": 0.857319, "learning_rate": 2.0046573374980447e-05, "loss": 0.86782068, "num_input_tokens_seen": 392128960, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4742, "time_per_iteration": 2.8531861305236816 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050211, "balance_loss_mlp": 1.04143143, "diversity_loss_mlp": 0.0, "epoch": 0.9124663332050789, "flos": 524690763264.0, "grad_norm": 0.08699441594773756, "language_loss": 0.87479031, "learning_rate": 1.995933526832239e-05, "loss": 0.88529241, "num_input_tokens_seen": 392195008, "router_z_loss_mlp": 0.08789062, "routerloss_mlp": 0.0, "step": 4743, "time_per_iteration": 2.650739908218384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049409, "balance_loss_mlp": 1.04080176, "diversity_loss_mlp": 0.0, "epoch": 0.9126587148903424, "flos": 563299826688.0, "grad_norm": 0.06693150560912724, "language_loss": 0.826424, "learning_rate": 1.9872283529978662e-05, "loss": 0.83691812, "num_input_tokens_seen": 392265168, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 4744, "time_per_iteration": 2.679450035095215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045731, "balance_loss_mlp": 1.03671229, "diversity_loss_mlp": 0.0, "epoch": 0.912851096575606, "flos": 505942184448.0, "grad_norm": 0.08010451753321661, "language_loss": 0.79965168, "learning_rate": 1.978541819374574e-05, "loss": 0.81010902, "num_input_tokens_seen": 392329456, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4745, "time_per_iteration": 2.5925939083099365 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048486, "balance_loss_mlp": 1.03974199, "diversity_loss_mlp": 0.0, "epoch": 0.9130434782608695, "flos": 550730783232.0, "grad_norm": 0.06455396152064795, "language_loss": 0.82245004, "learning_rate": 1.9698739293347755e-05, "loss": 0.83293486, "num_input_tokens_seen": 392397792, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 4746, "time_per_iteration": 2.6314661502838135 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049845, "balance_loss_mlp": 1.04123759, "diversity_loss_mlp": 0.0, "epoch": 0.9132358599461331, "flos": 468976200192.0, "grad_norm": 0.06909556408267023, "language_loss": 0.83497131, "learning_rate": 1.9612246862436456e-05, "loss": 0.84546977, "num_input_tokens_seen": 392462928, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 4747, "time_per_iteration": 2.5474631786346436 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046479, "balance_loss_mlp": 1.03760934, "diversity_loss_mlp": 0.0, "epoch": 0.9134282416313967, "flos": 506097828864.0, "grad_norm": 0.07312632583700283, "language_loss": 0.79836029, "learning_rate": 1.9525940934591148e-05, "loss": 0.80882508, "num_input_tokens_seen": 392531840, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4748, "time_per_iteration": 2.680522918701172 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050241, "balance_loss_mlp": 1.04132986, "diversity_loss_mlp": 0.0, "epoch": 0.9136206233166603, "flos": 604819519488.0, "grad_norm": 0.06502832751654097, "language_loss": 0.83780789, "learning_rate": 1.9439821543318748e-05, "loss": 0.84831029, "num_input_tokens_seen": 392602464, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4749, "time_per_iteration": 2.7452023029327393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050292, "balance_loss_mlp": 1.04147661, "diversity_loss_mlp": 0.0, "epoch": 0.9138130050019239, "flos": 561738240000.0, "grad_norm": 0.07375447300189412, "language_loss": 0.82539463, "learning_rate": 1.9353888722053793e-05, "loss": 0.83589756, "num_input_tokens_seen": 392669872, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4750, "time_per_iteration": 2.6701533794403076 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105006, "balance_loss_mlp": 1.04132831, "diversity_loss_mlp": 0.0, "epoch": 0.9140053866871873, "flos": 690117221376.0, "grad_norm": 0.06117546898764861, "language_loss": 0.90313232, "learning_rate": 1.9268142504158426e-05, "loss": 0.91363287, "num_input_tokens_seen": 392744256, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4751, "time_per_iteration": 2.8322813510894775 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042652, "balance_loss_mlp": 1.03372943, "diversity_loss_mlp": 0.0, "epoch": 0.9141977683724509, "flos": 551012336640.0, "grad_norm": 0.05974577392766342, "language_loss": 0.84016383, "learning_rate": 1.9182582922922186e-05, "loss": 0.85059029, "num_input_tokens_seen": 392816832, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 4752, "time_per_iteration": 2.688077449798584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050001, "balance_loss_mlp": 1.04098237, "diversity_loss_mlp": 0.0, "epoch": 0.9143901500577145, "flos": 540088943616.0, "grad_norm": 0.06413328541809935, "language_loss": 0.75752521, "learning_rate": 1.9097210011562228e-05, "loss": 0.76802522, "num_input_tokens_seen": 392886304, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4753, "time_per_iteration": 2.650331974029541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047101, "balance_loss_mlp": 1.03808916, "diversity_loss_mlp": 0.0, "epoch": 0.9145825317429781, "flos": 528767626752.0, "grad_norm": 0.08121838802327101, "language_loss": 0.80860132, "learning_rate": 1.9012023803223366e-05, "loss": 0.81907237, "num_input_tokens_seen": 392955872, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4754, "time_per_iteration": 2.6111409664154053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051972, "balance_loss_mlp": 1.04308462, "diversity_loss_mlp": 0.0, "epoch": 0.9147749134282416, "flos": 514792641024.0, "grad_norm": 0.06557647778558516, "language_loss": 0.79137278, "learning_rate": 1.892702433097776e-05, "loss": 0.80189246, "num_input_tokens_seen": 393025776, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4755, "time_per_iteration": 2.6349050998687744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047014, "balance_loss_mlp": 1.0382818, "diversity_loss_mlp": 0.0, "epoch": 0.9149672951135052, "flos": 514441704960.0, "grad_norm": 0.06908775382754948, "language_loss": 0.85741401, "learning_rate": 1.8842211627825233e-05, "loss": 0.8678841, "num_input_tokens_seen": 393095936, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 4756, "time_per_iteration": 2.681579113006592 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045202, "balance_loss_mlp": 1.03613043, "diversity_loss_mlp": 0.0, "epoch": 0.9151596767987688, "flos": 577069608960.0, "grad_norm": 0.06619379563809555, "language_loss": 0.81299222, "learning_rate": 1.8757585726692727e-05, "loss": 0.82344431, "num_input_tokens_seen": 393166816, "router_z_loss_mlp": 0.09069824, "routerloss_mlp": 0.0, "step": 4757, "time_per_iteration": 2.8199880123138428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104812, "balance_loss_mlp": 1.03948975, "diversity_loss_mlp": 0.0, "epoch": 0.9153520584840323, "flos": 619335590400.0, "grad_norm": 0.07903863840267403, "language_loss": 0.82496881, "learning_rate": 1.8673146660435182e-05, "loss": 0.83544993, "num_input_tokens_seen": 393242176, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 4758, "time_per_iteration": 2.7341158390045166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00789047, "balance_loss_mlp": 1.333637, "diversity_loss_mlp": 0.22318089, "epoch": 0.9155444401692959, "flos": 468921871872.0, "grad_norm": 0.0321241392563351, "language_loss": 0.83172476, "learning_rate": 1.8588894461834704e-05, "loss": 0.83961523, "num_input_tokens_seen": 393311792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01063856, "step": 4759, "time_per_iteration": 2.597241163253784 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01008181, "balance_loss_mlp": 1.00372291, "diversity_loss_mlp": 0.0, "epoch": 0.9157368218545594, "flos": 1410711054336.0, "grad_norm": 0.006260194037571693, "language_loss": 0.7481907, "learning_rate": 1.8504829163600855e-05, "loss": 0.75827253, "num_input_tokens_seen": 393535648, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4760, "time_per_iteration": 4.852627754211426 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01007794, "balance_loss_mlp": 1.00331163, "diversity_loss_mlp": 0.0, "epoch": 0.915929203539823, "flos": 1522019040768.0, "grad_norm": 0.006798931475656377, "language_loss": 0.79576051, "learning_rate": 1.8420950798370584e-05, "loss": 0.80583847, "num_input_tokens_seen": 393767040, "router_z_loss_mlp": 0.04492188, "routerloss_mlp": 0.0, "step": 4761, "time_per_iteration": 4.994880437850952 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047238, "balance_loss_mlp": 1.03847599, "diversity_loss_mlp": 0.0, "epoch": 0.9161215852250866, "flos": 535752548352.0, "grad_norm": 0.05790619573319675, "language_loss": 0.80362964, "learning_rate": 1.8337259398708616e-05, "loss": 0.81410205, "num_input_tokens_seen": 393841232, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4762, "time_per_iteration": 2.752257823944092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046406, "balance_loss_mlp": 1.03779316, "diversity_loss_mlp": 0.0, "epoch": 0.9163139669103502, "flos": 590624649216.0, "grad_norm": 0.07895774001894396, "language_loss": 0.8113842, "learning_rate": 1.8253754997106632e-05, "loss": 0.82184827, "num_input_tokens_seen": 393910512, "router_z_loss_mlp": 0.08624268, "routerloss_mlp": 0.0, "step": 4763, "time_per_iteration": 2.7287051677703857 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046516, "balance_loss_mlp": 1.03780174, "diversity_loss_mlp": 0.0, "epoch": 0.9165063485956138, "flos": 821975081472.0, "grad_norm": 0.06309721497849985, "language_loss": 0.8474853, "learning_rate": 1.817043762598397e-05, "loss": 0.85795045, "num_input_tokens_seen": 393988624, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4764, "time_per_iteration": 3.033647060394287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047904, "balance_loss_mlp": 1.03908885, "diversity_loss_mlp": 0.0, "epoch": 0.9166987302808772, "flos": 525194772480.0, "grad_norm": 0.06604892374800723, "language_loss": 0.8237828, "learning_rate": 1.8087307317687264e-05, "loss": 0.83426178, "num_input_tokens_seen": 394059184, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4765, "time_per_iteration": 2.6534650325775146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047298, "balance_loss_mlp": 1.03842866, "diversity_loss_mlp": 0.0, "epoch": 0.9168911119661408, "flos": 655095693312.0, "grad_norm": 0.05990107828974712, "language_loss": 0.84426653, "learning_rate": 1.800436410449058e-05, "loss": 0.85473955, "num_input_tokens_seen": 394142160, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4766, "time_per_iteration": 2.907374620437622 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049797, "balance_loss_mlp": 1.04099941, "diversity_loss_mlp": 0.0, "epoch": 0.9170834936514044, "flos": 491747314176.0, "grad_norm": 0.06352266446456978, "language_loss": 0.8504523, "learning_rate": 1.7921608018595436e-05, "loss": 0.86095023, "num_input_tokens_seen": 394207056, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4767, "time_per_iteration": 2.526810884475708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052899, "balance_loss_mlp": 1.04373765, "diversity_loss_mlp": 0.0, "epoch": 0.917275875336668, "flos": 628040314368.0, "grad_norm": 0.07650045088890157, "language_loss": 0.80317563, "learning_rate": 1.7839039092130415e-05, "loss": 0.81370461, "num_input_tokens_seen": 394275456, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4768, "time_per_iteration": 2.8045382499694824 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006939, "balance_loss_mlp": 1.00245714, "diversity_loss_mlp": 0.0, "epoch": 0.9174682570219315, "flos": 1517981824512.0, "grad_norm": 0.004694640504473852, "language_loss": 0.78180236, "learning_rate": 1.7756657357151762e-05, "loss": 0.79187173, "num_input_tokens_seen": 394503808, "router_z_loss_mlp": 0.04492188, "routerloss_mlp": 0.0, "step": 4769, "time_per_iteration": 5.044682264328003 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045239, "balance_loss_mlp": 1.03626275, "diversity_loss_mlp": 0.0, "epoch": 0.917660638707195, "flos": 560021008896.0, "grad_norm": 0.06781997849214876, "language_loss": 0.85250586, "learning_rate": 1.7674462845642835e-05, "loss": 0.86295819, "num_input_tokens_seen": 394573776, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 4770, "time_per_iteration": 2.691663980484009 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049626, "balance_loss_mlp": 1.04060245, "diversity_loss_mlp": 0.0, "epoch": 0.9178530203924586, "flos": 447252751872.0, "grad_norm": 0.06638212987757935, "language_loss": 0.84090322, "learning_rate": 1.7592455589514387e-05, "loss": 0.85139954, "num_input_tokens_seen": 394637600, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 4771, "time_per_iteration": 2.4912991523742676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048451, "balance_loss_mlp": 1.03953981, "diversity_loss_mlp": 0.0, "epoch": 0.9180454020777222, "flos": 465981507072.0, "grad_norm": 0.06646365406462024, "language_loss": 0.80387986, "learning_rate": 1.7510635620604453e-05, "loss": 0.81436437, "num_input_tokens_seen": 394707344, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4772, "time_per_iteration": 2.5629544258117676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051354, "balance_loss_mlp": 1.04250824, "diversity_loss_mlp": 0.0, "epoch": 0.9182377837629858, "flos": 596314856448.0, "grad_norm": 0.06012915212224945, "language_loss": 0.87101483, "learning_rate": 1.74290029706784e-05, "loss": 0.88152838, "num_input_tokens_seen": 394786368, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4773, "time_per_iteration": 2.7718729972839355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049092, "balance_loss_mlp": 1.04024041, "diversity_loss_mlp": 0.0, "epoch": 0.9184301654482493, "flos": 996671941632.0, "grad_norm": 0.05995829646518676, "language_loss": 0.8283515, "learning_rate": 1.734755767142876e-05, "loss": 0.83884239, "num_input_tokens_seen": 394876976, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4774, "time_per_iteration": 3.344503164291382 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051218, "balance_loss_mlp": 1.04242659, "diversity_loss_mlp": 0.0, "epoch": 0.9186225471335129, "flos": 508860154368.0, "grad_norm": 0.06073994859782487, "language_loss": 0.84713805, "learning_rate": 1.7266299754475467e-05, "loss": 0.85765028, "num_input_tokens_seen": 394949024, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4775, "time_per_iteration": 2.641633987426758 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048807, "balance_loss_mlp": 1.04000342, "diversity_loss_mlp": 0.0, "epoch": 0.9188149288187765, "flos": 940423633920.0, "grad_norm": 0.07386829063235183, "language_loss": 0.79117858, "learning_rate": 1.718522925136551e-05, "loss": 0.80166662, "num_input_tokens_seen": 395044352, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4776, "time_per_iteration": 3.311635971069336 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044367, "balance_loss_mlp": 1.03558719, "diversity_loss_mlp": 0.0, "epoch": 0.91900731050404, "flos": 583674232320.0, "grad_norm": 0.065220381744787, "language_loss": 0.84085238, "learning_rate": 1.7104346193573484e-05, "loss": 0.85129607, "num_input_tokens_seen": 395113824, "router_z_loss_mlp": 0.08789062, "routerloss_mlp": 0.0, "step": 4777, "time_per_iteration": 2.6673994064331055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049607, "balance_loss_mlp": 1.04089904, "diversity_loss_mlp": 0.0, "epoch": 0.9191996921893035, "flos": 581213283840.0, "grad_norm": 0.07320352446310975, "language_loss": 0.79461032, "learning_rate": 1.7023650612500828e-05, "loss": 0.8051064, "num_input_tokens_seen": 395184496, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4778, "time_per_iteration": 2.7164108753204346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048104, "balance_loss_mlp": 1.03928864, "diversity_loss_mlp": 0.0, "epoch": 0.9193920738745671, "flos": 908935686144.0, "grad_norm": 0.06805017648291643, "language_loss": 0.79739892, "learning_rate": 1.6943142539476374e-05, "loss": 0.80787992, "num_input_tokens_seen": 395263760, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4779, "time_per_iteration": 3.1064183712005615 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01006109, "balance_loss_mlp": 1.00162721, "diversity_loss_mlp": 0.0, "epoch": 0.9195844555598307, "flos": 1558372359168.0, "grad_norm": 0.003729713968603667, "language_loss": 0.79795396, "learning_rate": 1.686282200575606e-05, "loss": 0.80801499, "num_input_tokens_seen": 395482384, "router_z_loss_mlp": 0.04492188, "routerloss_mlp": 0.0, "step": 4780, "time_per_iteration": 4.670097351074219 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046783, "balance_loss_mlp": 1.03773558, "diversity_loss_mlp": 0.0, "epoch": 0.9197768372450943, "flos": 474053741568.0, "grad_norm": 0.07167718666233086, "language_loss": 0.78371525, "learning_rate": 1.678268904252317e-05, "loss": 0.79418308, "num_input_tokens_seen": 395550384, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4781, "time_per_iteration": 2.550713300704956 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047938, "balance_loss_mlp": 1.03888965, "diversity_loss_mlp": 0.0, "epoch": 0.9199692189303579, "flos": 857016059904.0, "grad_norm": 0.06622093872641387, "language_loss": 0.84516716, "learning_rate": 1.6702743680888088e-05, "loss": 0.85564649, "num_input_tokens_seen": 395632320, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4782, "time_per_iteration": 3.2526657581329346 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049444, "balance_loss_mlp": 1.04045606, "diversity_loss_mlp": 0.0, "epoch": 0.9201616006156214, "flos": 504390509568.0, "grad_norm": 0.06845257893605372, "language_loss": 0.77780342, "learning_rate": 1.6622985951888327e-05, "loss": 0.78829783, "num_input_tokens_seen": 395703856, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4783, "time_per_iteration": 2.6809587478637695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048183, "balance_loss_mlp": 1.03927171, "diversity_loss_mlp": 0.0, "epoch": 0.9203539823008849, "flos": 548781184512.0, "grad_norm": 0.06867364706040735, "language_loss": 0.85155487, "learning_rate": 1.6543415886488554e-05, "loss": 0.86203671, "num_input_tokens_seen": 395779456, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4784, "time_per_iteration": 2.7345173358917236 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787021, "balance_loss_mlp": 1.32680988, "diversity_loss_mlp": 0.22533412, "epoch": 0.9205463639861485, "flos": 540004879872.0, "grad_norm": 0.03407668721721812, "language_loss": 0.82609832, "learning_rate": 1.6464033515580624e-05, "loss": 0.83396852, "num_input_tokens_seen": 395849584, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01094901, "step": 4785, "time_per_iteration": 2.685168504714966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044838, "balance_loss_mlp": 1.03591502, "diversity_loss_mlp": 0.0, "epoch": 0.9207387456714121, "flos": 799725229056.0, "grad_norm": 0.0666841111034061, "language_loss": 0.77980995, "learning_rate": 1.6384838869983488e-05, "loss": 0.79025835, "num_input_tokens_seen": 395943712, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 4786, "time_per_iteration": 3.038740873336792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051934, "balance_loss_mlp": 1.04323745, "diversity_loss_mlp": 0.0, "epoch": 0.9209311273566756, "flos": 502848746496.0, "grad_norm": 0.06529131061254304, "language_loss": 0.78631401, "learning_rate": 1.630583198044333e-05, "loss": 0.79683334, "num_input_tokens_seen": 396013168, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4787, "time_per_iteration": 2.65899658203125 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047226, "balance_loss_mlp": 1.03834498, "diversity_loss_mlp": 0.0, "epoch": 0.9211235090419392, "flos": 569323717632.0, "grad_norm": 0.0788130161570292, "language_loss": 0.8252883, "learning_rate": 1.6227012877633173e-05, "loss": 0.83576053, "num_input_tokens_seen": 396082032, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4788, "time_per_iteration": 2.6822633743286133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049948, "balance_loss_mlp": 1.04112673, "diversity_loss_mlp": 0.0, "epoch": 0.9213158907272028, "flos": 806549736960.0, "grad_norm": 0.07410580856976316, "language_loss": 0.82474685, "learning_rate": 1.6148381592153538e-05, "loss": 0.83524632, "num_input_tokens_seen": 396157984, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4789, "time_per_iteration": 2.9761576652526855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045718, "balance_loss_mlp": 1.03685474, "diversity_loss_mlp": 0.0, "epoch": 0.9215082724124664, "flos": 490682396160.0, "grad_norm": 0.0657414722313636, "language_loss": 0.76699907, "learning_rate": 1.6069938154531618e-05, "loss": 0.77745622, "num_input_tokens_seen": 396223840, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4790, "time_per_iteration": 2.523589849472046 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004074, "balance_loss_mlp": 0.99959189, "diversity_loss_mlp": 0.0, "epoch": 0.9217006540977299, "flos": 1514495232000.0, "grad_norm": 0.003599452207974624, "language_loss": 0.77070266, "learning_rate": 1.599168259522188e-05, "loss": 0.78074342, "num_input_tokens_seen": 396458288, "router_z_loss_mlp": 0.04492188, "routerloss_mlp": 0.0, "step": 4791, "time_per_iteration": 4.9881064891815186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052288, "balance_loss_mlp": 1.04335308, "diversity_loss_mlp": 0.0, "epoch": 0.9218930357829934, "flos": 743793352704.0, "grad_norm": 0.06705071724600334, "language_loss": 0.76482338, "learning_rate": 1.5913614944605804e-05, "loss": 0.77534628, "num_input_tokens_seen": 396536208, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4792, "time_per_iteration": 2.9655344486236572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044613, "balance_loss_mlp": 1.03580952, "diversity_loss_mlp": 0.0, "epoch": 0.922085417468257, "flos": 453036934656.0, "grad_norm": 0.20959696332428077, "language_loss": 0.80366439, "learning_rate": 1.5835735232992032e-05, "loss": 0.81411052, "num_input_tokens_seen": 396599984, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4793, "time_per_iteration": 2.554954767227173 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044371, "balance_loss_mlp": 1.0355078, "diversity_loss_mlp": 0.0, "epoch": 0.9222777991535206, "flos": 500249405952.0, "grad_norm": 0.07075391253683742, "language_loss": 0.84841311, "learning_rate": 1.575804349061616e-05, "loss": 0.8588568, "num_input_tokens_seen": 396664592, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 4794, "time_per_iteration": 2.5949018001556396 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047616, "balance_loss_mlp": 1.0387888, "diversity_loss_mlp": 0.0, "epoch": 0.9224701808387842, "flos": 527959669248.0, "grad_norm": 0.0784160138888604, "language_loss": 0.79135698, "learning_rate": 1.5680539747640722e-05, "loss": 0.80183321, "num_input_tokens_seen": 396729472, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4795, "time_per_iteration": 2.598656415939331 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048282, "balance_loss_mlp": 1.03969288, "diversity_loss_mlp": 0.0, "epoch": 0.9226625625240477, "flos": 874640623104.0, "grad_norm": 0.06249472558878416, "language_loss": 0.75247115, "learning_rate": 1.5603224034155315e-05, "loss": 0.76295394, "num_input_tokens_seen": 396810384, "router_z_loss_mlp": 0.08587646, "routerloss_mlp": 0.0, "step": 4796, "time_per_iteration": 3.1448936462402344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050009, "balance_loss_mlp": 1.04117608, "diversity_loss_mlp": 0.0, "epoch": 0.9228549442093112, "flos": 502774594560.0, "grad_norm": 0.07031980659654383, "language_loss": 0.88239074, "learning_rate": 1.5526096380176657e-05, "loss": 0.89289081, "num_input_tokens_seen": 396875472, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4797, "time_per_iteration": 2.543046474456787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00783825, "balance_loss_mlp": 1.32076359, "diversity_loss_mlp": 0.2258461, "epoch": 0.9230473258945748, "flos": 599989026816.0, "grad_norm": 0.030753006157988122, "language_loss": 0.84967744, "learning_rate": 1.544915681564829e-05, "loss": 0.85751569, "num_input_tokens_seen": 396949888, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01051996, "step": 4798, "time_per_iteration": 2.819098949432373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049115, "balance_loss_mlp": 1.04029381, "diversity_loss_mlp": 0.0, "epoch": 0.9232397075798384, "flos": 822508826112.0, "grad_norm": 0.06926441515905145, "language_loss": 0.79267633, "learning_rate": 1.5372405370440822e-05, "loss": 0.80316746, "num_input_tokens_seen": 397027504, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4799, "time_per_iteration": 3.0866541862487793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048573, "balance_loss_mlp": 1.03970361, "diversity_loss_mlp": 0.0, "epoch": 0.923432089265102, "flos": 707030000640.0, "grad_norm": 0.06842232748476472, "language_loss": 0.84939086, "learning_rate": 1.5295842074351805e-05, "loss": 0.85987657, "num_input_tokens_seen": 397101600, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4800, "time_per_iteration": 2.840742588043213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048436, "balance_loss_mlp": 1.03941798, "diversity_loss_mlp": 0.0, "epoch": 0.9236244709503655, "flos": 701861054976.0, "grad_norm": 0.07816499010690336, "language_loss": 0.76574665, "learning_rate": 1.5219466957105798e-05, "loss": 0.77623105, "num_input_tokens_seen": 397170880, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4801, "time_per_iteration": 2.8335320949554443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050463, "balance_loss_mlp": 1.04159379, "diversity_loss_mlp": 0.0, "epoch": 0.9238168526356291, "flos": 515039689728.0, "grad_norm": 0.06210245880406286, "language_loss": 0.843297, "learning_rate": 1.5143280048354136e-05, "loss": 0.85380167, "num_input_tokens_seen": 397242272, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4802, "time_per_iteration": 2.6566197872161865 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047457, "balance_loss_mlp": 1.03858757, "diversity_loss_mlp": 0.0, "epoch": 0.9240092343208927, "flos": 492024098304.0, "grad_norm": 0.09058835826894181, "language_loss": 0.81587046, "learning_rate": 1.5067281377675213e-05, "loss": 0.82634509, "num_input_tokens_seen": 397308032, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4803, "time_per_iteration": 2.6244726181030273 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045538, "balance_loss_mlp": 1.0367403, "diversity_loss_mlp": 0.0, "epoch": 0.9242016160061562, "flos": 647218750464.0, "grad_norm": 0.06939366274556823, "language_loss": 0.73765552, "learning_rate": 1.4991470974574484e-05, "loss": 0.74811089, "num_input_tokens_seen": 397390944, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4804, "time_per_iteration": 2.8761777877807617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050005, "balance_loss_mlp": 1.04128492, "diversity_loss_mlp": 0.0, "epoch": 0.9243939976914197, "flos": 729430354944.0, "grad_norm": 0.07337139477875909, "language_loss": 0.79396987, "learning_rate": 1.4915848868484016e-05, "loss": 0.80446994, "num_input_tokens_seen": 397468128, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4805, "time_per_iteration": 2.9650769233703613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046276, "balance_loss_mlp": 1.03742468, "diversity_loss_mlp": 0.0, "epoch": 0.9245863793766833, "flos": 452246229504.0, "grad_norm": 0.07187105546875673, "language_loss": 0.90605378, "learning_rate": 1.4840415088763048e-05, "loss": 0.91651654, "num_input_tokens_seen": 397538976, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4806, "time_per_iteration": 2.6060450077056885 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00788148, "balance_loss_mlp": 1.33026791, "diversity_loss_mlp": 0.22471815, "epoch": 0.9247787610619469, "flos": 755030605824.0, "grad_norm": 0.03522090358058462, "language_loss": 0.77311206, "learning_rate": 1.476516966469732e-05, "loss": 0.78099358, "num_input_tokens_seen": 397612944, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01065494, "step": 4807, "time_per_iteration": 2.9656925201416016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047033, "balance_loss_mlp": 1.03775859, "diversity_loss_mlp": 0.0, "epoch": 0.9249711427472105, "flos": 561928389120.0, "grad_norm": 0.05970940147953983, "language_loss": 0.85029161, "learning_rate": 1.4690112625499908e-05, "loss": 0.860762, "num_input_tokens_seen": 397690848, "router_z_loss_mlp": 0.0927124, "routerloss_mlp": 0.0, "step": 4808, "time_per_iteration": 2.730725049972534 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045322, "balance_loss_mlp": 1.0360359, "diversity_loss_mlp": 0.0, "epoch": 0.9251635244324741, "flos": 526699459584.0, "grad_norm": 0.07434097229920794, "language_loss": 0.85175872, "learning_rate": 1.4615244000310501e-05, "loss": 0.86221194, "num_input_tokens_seen": 397761008, "router_z_loss_mlp": 0.09283447, "routerloss_mlp": 0.0, "step": 4809, "time_per_iteration": 2.677678346633911 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047523, "balance_loss_mlp": 1.03848672, "diversity_loss_mlp": 0.0, "epoch": 0.9253559061177375, "flos": 611280608256.0, "grad_norm": 0.06773039177733224, "language_loss": 0.79278344, "learning_rate": 1.4540563818195685e-05, "loss": 0.80325866, "num_input_tokens_seen": 397840640, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4810, "time_per_iteration": 2.7994203567504883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003108, "balance_loss_mlp": 0.99864995, "diversity_loss_mlp": 0.0, "epoch": 0.9255482878030011, "flos": 1551258957312.0, "grad_norm": 0.003310724835280569, "language_loss": 0.76925391, "learning_rate": 1.446607210814882e-05, "loss": 0.77928501, "num_input_tokens_seen": 398060096, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4811, "time_per_iteration": 4.716477394104004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01053397, "balance_loss_mlp": 1.04443264, "diversity_loss_mlp": 0.0, "epoch": 0.9257406694882647, "flos": 766366603776.0, "grad_norm": 0.07798685492020957, "language_loss": 0.80983555, "learning_rate": 1.4391768899090219e-05, "loss": 0.82036948, "num_input_tokens_seen": 398143680, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4812, "time_per_iteration": 3.111435651779175 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046457, "balance_loss_mlp": 1.03743255, "diversity_loss_mlp": 0.0, "epoch": 0.9259330511735283, "flos": 497991089664.0, "grad_norm": 0.07891038810151499, "language_loss": 0.83191156, "learning_rate": 1.431765421986686e-05, "loss": 0.84237611, "num_input_tokens_seen": 398207056, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 4813, "time_per_iteration": 2.5696511268615723 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049476, "balance_loss_mlp": 1.04083896, "diversity_loss_mlp": 0.0, "epoch": 0.9261254328587919, "flos": 626874080256.0, "grad_norm": 0.06938826271777476, "language_loss": 0.79197675, "learning_rate": 1.424372809925273e-05, "loss": 0.80247152, "num_input_tokens_seen": 398277472, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 4814, "time_per_iteration": 2.716487407684326 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047968, "balance_loss_mlp": 1.03926587, "diversity_loss_mlp": 0.0, "epoch": 0.9263178145440554, "flos": 597382345728.0, "grad_norm": 0.06659923130000121, "language_loss": 0.8535648, "learning_rate": 1.416999056594831e-05, "loss": 0.86404449, "num_input_tokens_seen": 398346544, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4815, "time_per_iteration": 2.7244887351989746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050391, "balance_loss_mlp": 1.0416646, "diversity_loss_mlp": 0.0, "epoch": 0.926510196229319, "flos": 388563319296.0, "grad_norm": 0.06890226138960381, "language_loss": 0.83825701, "learning_rate": 1.4096441648581259e-05, "loss": 0.84876096, "num_input_tokens_seen": 398409344, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4816, "time_per_iteration": 2.464979887008667 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048928, "balance_loss_mlp": 1.04029167, "diversity_loss_mlp": 0.0, "epoch": 0.9267025779145825, "flos": 545798974464.0, "grad_norm": 0.07919281923401009, "language_loss": 0.84257257, "learning_rate": 1.4023081375705737e-05, "loss": 0.85306185, "num_input_tokens_seen": 398478816, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 4817, "time_per_iteration": 2.640580415725708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047669, "balance_loss_mlp": 1.03899682, "diversity_loss_mlp": 0.0, "epoch": 0.9268949595998461, "flos": 499789813248.0, "grad_norm": 0.06905431252215245, "language_loss": 0.82030249, "learning_rate": 1.3949909775802682e-05, "loss": 0.83077914, "num_input_tokens_seen": 398550384, "router_z_loss_mlp": 0.08679199, "routerloss_mlp": 0.0, "step": 4818, "time_per_iteration": 2.6683123111724854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104897, "balance_loss_mlp": 1.04013073, "diversity_loss_mlp": 0.0, "epoch": 0.9270873412851096, "flos": 432828085248.0, "grad_norm": 0.06364347314694363, "language_loss": 0.82941604, "learning_rate": 1.3876926877279817e-05, "loss": 0.8399058, "num_input_tokens_seen": 398620832, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4819, "time_per_iteration": 2.622507333755493 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047498, "balance_loss_mlp": 1.03880203, "diversity_loss_mlp": 0.0, "epoch": 0.9272797229703732, "flos": 466769640960.0, "grad_norm": 0.07369631813155064, "language_loss": 0.8604511, "learning_rate": 1.380413270847164e-05, "loss": 0.87092614, "num_input_tokens_seen": 398689776, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4820, "time_per_iteration": 2.5886447429656982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042961, "balance_loss_mlp": 1.03391302, "diversity_loss_mlp": 0.0, "epoch": 0.9274721046556368, "flos": 704838122496.0, "grad_norm": 0.06986061953541225, "language_loss": 0.78981894, "learning_rate": 1.373152729763938e-05, "loss": 0.80024862, "num_input_tokens_seen": 398775072, "router_z_loss_mlp": 0.09051514, "routerloss_mlp": 0.0, "step": 4821, "time_per_iteration": 3.002431869506836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100315, "balance_loss_mlp": 0.99869162, "diversity_loss_mlp": 0.0, "epoch": 0.9276644863409004, "flos": 1402255950336.0, "grad_norm": 0.0033138689547235365, "language_loss": 0.82380462, "learning_rate": 1.3659110672970931e-05, "loss": 0.83383614, "num_input_tokens_seen": 399002016, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4822, "time_per_iteration": 4.872236728668213 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048216, "balance_loss_mlp": 1.03961504, "diversity_loss_mlp": 0.0, "epoch": 0.927856868026164, "flos": 741722614272.0, "grad_norm": 0.10753003885480804, "language_loss": 0.80162168, "learning_rate": 1.3586882862580917e-05, "loss": 0.81210387, "num_input_tokens_seen": 399085808, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 4823, "time_per_iteration": 3.065385103225708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010481, "balance_loss_mlp": 1.03920078, "diversity_loss_mlp": 0.0, "epoch": 0.9280492497114274, "flos": 412223883264.0, "grad_norm": 0.07544984559040653, "language_loss": 0.74334532, "learning_rate": 1.3514843894510686e-05, "loss": 0.75382626, "num_input_tokens_seen": 399146768, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4824, "time_per_iteration": 2.459182024002075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045084, "balance_loss_mlp": 1.03613138, "diversity_loss_mlp": 0.0, "epoch": 0.928241631396691, "flos": 646504768512.0, "grad_norm": 0.1022591189326798, "language_loss": 0.84062541, "learning_rate": 1.3442993796728254e-05, "loss": 0.85107625, "num_input_tokens_seen": 399220192, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4825, "time_per_iteration": 2.7902333736419678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104756, "balance_loss_mlp": 1.03844011, "diversity_loss_mlp": 0.0, "epoch": 0.9284340130819546, "flos": 696855094272.0, "grad_norm": 0.06332347540086566, "language_loss": 0.80870605, "learning_rate": 1.3371332597128249e-05, "loss": 0.81918162, "num_input_tokens_seen": 399300064, "router_z_loss_mlp": 0.09118652, "routerloss_mlp": 0.0, "step": 4826, "time_per_iteration": 3.014462947845459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078881, "balance_loss_mlp": 1.33157349, "diversity_loss_mlp": 0.22439189, "epoch": 0.9286263947672182, "flos": 759132062208.0, "grad_norm": 0.028742947039502215, "language_loss": 0.83905512, "learning_rate": 1.3299860323532032e-05, "loss": 0.84694326, "num_input_tokens_seen": 399383200, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01082761, "step": 4827, "time_per_iteration": 3.0634989738464355 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046746, "balance_loss_mlp": 1.03804338, "diversity_loss_mlp": 0.0, "epoch": 0.9288187764524817, "flos": 672823770624.0, "grad_norm": 0.07468304915568001, "language_loss": 0.80064201, "learning_rate": 1.3228577003687681e-05, "loss": 0.81110942, "num_input_tokens_seen": 399466400, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4828, "time_per_iteration": 2.9195716381073 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104862, "balance_loss_mlp": 1.03953636, "diversity_loss_mlp": 0.0, "epoch": 0.9290111581377453, "flos": 500469290496.0, "grad_norm": 0.06920378526179259, "language_loss": 0.83656001, "learning_rate": 1.3157482665269727e-05, "loss": 0.84704626, "num_input_tokens_seen": 399533504, "router_z_loss_mlp": 0.09082031, "routerloss_mlp": 0.0, "step": 4829, "time_per_iteration": 2.5818231105804443 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100325, "balance_loss_mlp": 0.99879169, "diversity_loss_mlp": 0.0, "epoch": 0.9292035398230089, "flos": 1563627566592.0, "grad_norm": 0.0032198614954978416, "language_loss": 0.72122061, "learning_rate": 1.3086577335879424e-05, "loss": 0.73125315, "num_input_tokens_seen": 399769872, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4830, "time_per_iteration": 4.951828718185425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003246, "balance_loss_mlp": 0.99878782, "diversity_loss_mlp": 0.0, "epoch": 0.9293959215082724, "flos": 1518673411584.0, "grad_norm": 0.003220380799395436, "language_loss": 0.79511833, "learning_rate": 1.3015861043044753e-05, "loss": 0.80515087, "num_input_tokens_seen": 399997760, "router_z_loss_mlp": 0.04467773, "routerloss_mlp": 0.0, "step": 4831, "time_per_iteration": 4.905702590942383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105208, "balance_loss_mlp": 1.04304385, "diversity_loss_mlp": 0.0, "epoch": 0.929588303193536, "flos": 557836844544.0, "grad_norm": 0.08579455116544467, "language_loss": 0.84383392, "learning_rate": 1.2945333814220195e-05, "loss": 0.85435468, "num_input_tokens_seen": 400063872, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4832, "time_per_iteration": 2.6667189598083496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051534, "balance_loss_mlp": 1.04263496, "diversity_loss_mlp": 0.0, "epoch": 0.9297806848787995, "flos": 478580285952.0, "grad_norm": 0.07653793753230506, "language_loss": 0.80192435, "learning_rate": 1.2874995676786905e-05, "loss": 0.81243968, "num_input_tokens_seen": 400126064, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4833, "time_per_iteration": 2.530576705932617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784425, "balance_loss_mlp": 1.32099247, "diversity_loss_mlp": 0.22666103, "epoch": 0.9299730665640631, "flos": 564537641472.0, "grad_norm": 0.02823635345590092, "language_loss": 0.80189478, "learning_rate": 1.2804846658052372e-05, "loss": 0.80973905, "num_input_tokens_seen": 400201776, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01059832, "step": 4834, "time_per_iteration": 2.8291900157928467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046644, "balance_loss_mlp": 1.03810263, "diversity_loss_mlp": 0.0, "epoch": 0.9301654482493267, "flos": 560174082048.0, "grad_norm": 0.06755490191164544, "language_loss": 0.82792151, "learning_rate": 1.2734886785251032e-05, "loss": 0.83838797, "num_input_tokens_seen": 400279504, "router_z_loss_mlp": 0.08551025, "routerloss_mlp": 0.0, "step": 4835, "time_per_iteration": 2.823146104812622 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003179, "balance_loss_mlp": 0.99874461, "diversity_loss_mlp": 0.0, "epoch": 0.9303578299345903, "flos": 1520096606208.0, "grad_norm": 0.0032138775564420387, "language_loss": 0.76852441, "learning_rate": 1.2665116085543715e-05, "loss": 0.77855623, "num_input_tokens_seen": 400514800, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4836, "time_per_iteration": 4.9668896198272705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049636, "balance_loss_mlp": 1.04090357, "diversity_loss_mlp": 0.0, "epoch": 0.9305502116198537, "flos": 530843134464.0, "grad_norm": 0.07503406646188047, "language_loss": 0.82993883, "learning_rate": 1.2595534586017698e-05, "loss": 0.84043521, "num_input_tokens_seen": 400582640, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4837, "time_per_iteration": 2.637373924255371 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045053, "balance_loss_mlp": 1.03608274, "diversity_loss_mlp": 0.0, "epoch": 0.9307425933051173, "flos": 474898775040.0, "grad_norm": 0.08374095917705242, "language_loss": 0.81554383, "learning_rate": 1.2526142313686983e-05, "loss": 0.82599437, "num_input_tokens_seen": 400646912, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4838, "time_per_iteration": 2.515183448791504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784124, "balance_loss_mlp": 1.32047153, "diversity_loss_mlp": 0.22594652, "epoch": 0.9309349749903809, "flos": 584892223488.0, "grad_norm": 0.03184031575728359, "language_loss": 0.86872089, "learning_rate": 1.245693929549213e-05, "loss": 0.87656212, "num_input_tokens_seen": 400722128, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01091547, "step": 4839, "time_per_iteration": 2.7616403102874756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047731, "balance_loss_mlp": 1.03896284, "diversity_loss_mlp": 0.0, "epoch": 0.9311273566756445, "flos": 861666315264.0, "grad_norm": 0.061490618450412385, "language_loss": 0.76999998, "learning_rate": 1.2387925558299984e-05, "loss": 0.78047729, "num_input_tokens_seen": 400801440, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4840, "time_per_iteration": 3.0911495685577393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049189, "balance_loss_mlp": 1.04037976, "diversity_loss_mlp": 0.0, "epoch": 0.9313197383609081, "flos": 548094366720.0, "grad_norm": 0.07195558921256455, "language_loss": 0.82423127, "learning_rate": 1.231910112890411e-05, "loss": 0.83472311, "num_input_tokens_seen": 400873008, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4841, "time_per_iteration": 2.6239800453186035 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010463, "balance_loss_mlp": 1.0373354, "diversity_loss_mlp": 0.0, "epoch": 0.9315121200461716, "flos": 468756315648.0, "grad_norm": 0.07717139537202818, "language_loss": 0.81388533, "learning_rate": 1.2250466034024522e-05, "loss": 0.82434833, "num_input_tokens_seen": 400935328, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 4842, "time_per_iteration": 2.5533297061920166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104779, "balance_loss_mlp": 1.03906965, "diversity_loss_mlp": 0.0, "epoch": 0.9317045017314352, "flos": 417659701248.0, "grad_norm": 0.07073553761883396, "language_loss": 0.77673644, "learning_rate": 1.2182020300307684e-05, "loss": 0.78721428, "num_input_tokens_seen": 401000720, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4843, "time_per_iteration": 2.528705358505249 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046518, "balance_loss_mlp": 1.03777993, "diversity_loss_mlp": 0.0, "epoch": 0.9318968834166987, "flos": 540489065472.0, "grad_norm": 0.06887316839423005, "language_loss": 0.7711761, "learning_rate": 1.2113763954326729e-05, "loss": 0.78164124, "num_input_tokens_seen": 401079664, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 4844, "time_per_iteration": 2.7841336727142334 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047669, "balance_loss_mlp": 1.0387938, "diversity_loss_mlp": 0.0, "epoch": 0.9320892651019623, "flos": 521330452992.0, "grad_norm": 0.07471339735643584, "language_loss": 0.80957037, "learning_rate": 1.2045697022581015e-05, "loss": 0.82004702, "num_input_tokens_seen": 401146160, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 4845, "time_per_iteration": 2.5967259407043457 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047686, "balance_loss_mlp": 1.03901899, "diversity_loss_mlp": 0.0, "epoch": 0.9322816467872258, "flos": 582072998400.0, "grad_norm": 0.0577436249864269, "language_loss": 0.80821908, "learning_rate": 1.1977819531496348e-05, "loss": 0.8186959, "num_input_tokens_seen": 401223264, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 4846, "time_per_iteration": 2.735156774520874 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00780467, "balance_loss_mlp": 1.31358051, "diversity_loss_mlp": 0.22594975, "epoch": 0.9324740284724894, "flos": 484747338240.0, "grad_norm": 0.03394753668394222, "language_loss": 0.82436854, "learning_rate": 1.191013150742537e-05, "loss": 0.83217323, "num_input_tokens_seen": 401296368, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01070218, "step": 4847, "time_per_iteration": 2.730957269668579 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047642, "balance_loss_mlp": 1.03871894, "diversity_loss_mlp": 0.0, "epoch": 0.932666410157753, "flos": 732585461760.0, "grad_norm": 0.06722310118133415, "language_loss": 0.82897216, "learning_rate": 1.1842632976646672e-05, "loss": 0.83944857, "num_input_tokens_seen": 401383936, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4848, "time_per_iteration": 3.0189881324768066 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044207, "balance_loss_mlp": 1.03535616, "diversity_loss_mlp": 0.0, "epoch": 0.9328587918430166, "flos": 965537127936.0, "grad_norm": 0.08276324861402574, "language_loss": 0.78624225, "learning_rate": 1.1775323965365681e-05, "loss": 0.79668438, "num_input_tokens_seen": 401468784, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4849, "time_per_iteration": 3.2938950061798096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043788, "balance_loss_mlp": 1.03470397, "diversity_loss_mlp": 0.0, "epoch": 0.9330511735282802, "flos": 614552085504.0, "grad_norm": 0.07019081687121781, "language_loss": 0.80391824, "learning_rate": 1.1708204499713936e-05, "loss": 0.81435609, "num_input_tokens_seen": 401539712, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4850, "time_per_iteration": 2.7515499591827393 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047473, "balance_loss_mlp": 1.03849709, "diversity_loss_mlp": 0.0, "epoch": 0.9332435552135436, "flos": 559101823488.0, "grad_norm": 0.06820253841014733, "language_loss": 0.85668182, "learning_rate": 1.1641274605749653e-05, "loss": 0.86715662, "num_input_tokens_seen": 401610432, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4851, "time_per_iteration": 2.680340528488159 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047775, "balance_loss_mlp": 1.03895366, "diversity_loss_mlp": 0.0, "epoch": 0.9334359368988072, "flos": 515536358400.0, "grad_norm": 0.1196628498062152, "language_loss": 0.8199991, "learning_rate": 1.1574534309457208e-05, "loss": 0.83047688, "num_input_tokens_seen": 401677344, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4852, "time_per_iteration": 2.5966830253601074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045996, "balance_loss_mlp": 1.03706086, "diversity_loss_mlp": 0.0, "epoch": 0.9336283185840708, "flos": 539809588224.0, "grad_norm": 0.07419739239105261, "language_loss": 0.82826304, "learning_rate": 1.1507983636747488e-05, "loss": 0.838723, "num_input_tokens_seen": 401756864, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4853, "time_per_iteration": 2.7714791297912598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100356, "balance_loss_mlp": 0.999125, "diversity_loss_mlp": 0.0, "epoch": 0.9338207002693344, "flos": 1562824751616.0, "grad_norm": 0.004307105036144614, "language_loss": 0.78455019, "learning_rate": 1.1441622613457824e-05, "loss": 0.7945857, "num_input_tokens_seen": 401983664, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4854, "time_per_iteration": 4.893805265426636 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046802, "balance_loss_mlp": 1.03777242, "diversity_loss_mlp": 0.0, "epoch": 0.9340130819545979, "flos": 645261811200.0, "grad_norm": 0.06988266936343929, "language_loss": 0.81466687, "learning_rate": 1.1375451265351833e-05, "loss": 0.82513487, "num_input_tokens_seen": 402065744, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 4855, "time_per_iteration": 2.9019949436187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046813, "balance_loss_mlp": 1.03802776, "diversity_loss_mlp": 0.0, "epoch": 0.9342054636398615, "flos": 503441588736.0, "grad_norm": 0.06582390304127933, "language_loss": 0.76894152, "learning_rate": 1.1309469618119516e-05, "loss": 0.77940965, "num_input_tokens_seen": 402137728, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4856, "time_per_iteration": 2.650545597076416 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049411, "balance_loss_mlp": 1.04064322, "diversity_loss_mlp": 0.0, "epoch": 0.934397845325125, "flos": 593026126848.0, "grad_norm": 0.0537499767930613, "language_loss": 0.84482789, "learning_rate": 1.1243677697377109e-05, "loss": 0.855322, "num_input_tokens_seen": 402220160, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4857, "time_per_iteration": 2.82725191116333 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044882, "balance_loss_mlp": 1.03607237, "diversity_loss_mlp": 0.0, "epoch": 0.9345902270103886, "flos": 499891129344.0, "grad_norm": 0.0729144221953202, "language_loss": 0.80315518, "learning_rate": 1.1178075528667453e-05, "loss": 0.813604, "num_input_tokens_seen": 402285168, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4858, "time_per_iteration": 2.575934410095215 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00589881, "balance_loss_mlp": 1.02917051, "diversity_loss_mlp": 0.13201948, "epoch": 0.9347826086956522, "flos": 1520329347072.0, "grad_norm": 0.001270733186004784, "language_loss": 0.7598772, "learning_rate": 1.1112663137459566e-05, "loss": 0.76577604, "num_input_tokens_seen": 402504912, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00928602, "step": 4859, "time_per_iteration": 4.699007987976074 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043954, "balance_loss_mlp": 1.03486431, "diversity_loss_mlp": 0.0, "epoch": 0.9349749903809157, "flos": 504550923264.0, "grad_norm": 0.05691745976231031, "language_loss": 0.81198478, "learning_rate": 1.1047440549148636e-05, "loss": 0.82242435, "num_input_tokens_seen": 402582032, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4860, "time_per_iteration": 2.777012825012207 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043681, "balance_loss_mlp": 1.0347048, "diversity_loss_mlp": 0.0, "epoch": 0.9351673720661793, "flos": 568901200896.0, "grad_norm": 0.10010618557822787, "language_loss": 0.79151934, "learning_rate": 1.0982407789056514e-05, "loss": 0.80195618, "num_input_tokens_seen": 402650144, "router_z_loss_mlp": 0.08984375, "routerloss_mlp": 0.0, "step": 4861, "time_per_iteration": 2.6320016384124756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044691, "balance_loss_mlp": 1.03600073, "diversity_loss_mlp": 0.0, "epoch": 0.9353597537514429, "flos": 544605576192.0, "grad_norm": 0.08362946312424821, "language_loss": 0.86286509, "learning_rate": 1.0917564882430952e-05, "loss": 0.87331206, "num_input_tokens_seen": 402720368, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4862, "time_per_iteration": 2.6105833053588867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044576, "balance_loss_mlp": 1.03586817, "diversity_loss_mlp": 0.0, "epoch": 0.9355521354367065, "flos": 518997984768.0, "grad_norm": 0.05900589694062164, "language_loss": 0.84758484, "learning_rate": 1.0852911854446368e-05, "loss": 0.85803056, "num_input_tokens_seen": 402795568, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4863, "time_per_iteration": 2.7426371574401855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045058, "balance_loss_mlp": 1.03628969, "diversity_loss_mlp": 0.0, "epoch": 0.93574451712197, "flos": 446316314112.0, "grad_norm": 0.0932071553441471, "language_loss": 0.78725177, "learning_rate": 1.0788448730203237e-05, "loss": 0.79770231, "num_input_tokens_seen": 402858784, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4864, "time_per_iteration": 2.5507235527038574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045394, "balance_loss_mlp": 1.0366559, "diversity_loss_mlp": 0.0, "epoch": 0.9359368988072335, "flos": 480517401600.0, "grad_norm": 0.08522352532070332, "language_loss": 0.76506388, "learning_rate": 1.072417553472832e-05, "loss": 0.77551782, "num_input_tokens_seen": 402924144, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 4865, "time_per_iteration": 4.053428649902344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045737, "balance_loss_mlp": 1.03688622, "diversity_loss_mlp": 0.0, "epoch": 0.9361292804924971, "flos": 497118892032.0, "grad_norm": 0.06592512300053538, "language_loss": 0.85022455, "learning_rate": 1.0660092292974766e-05, "loss": 0.86068201, "num_input_tokens_seen": 402987488, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4866, "time_per_iteration": 2.608532667160034 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045045, "balance_loss_mlp": 1.03633118, "diversity_loss_mlp": 0.0, "epoch": 0.9363216621777607, "flos": 618122368512.0, "grad_norm": 0.08990017203823457, "language_loss": 0.84334439, "learning_rate": 1.059619902982184e-05, "loss": 0.85379487, "num_input_tokens_seen": 403058224, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4867, "time_per_iteration": 2.7455151081085205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003805, "balance_loss_mlp": 0.99937075, "diversity_loss_mlp": 0.0, "epoch": 0.9365140438630243, "flos": 1415929559040.0, "grad_norm": 0.005040674101907188, "language_loss": 0.79203337, "learning_rate": 1.053249577007509e-05, "loss": 0.80207145, "num_input_tokens_seen": 403289072, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4868, "time_per_iteration": 4.876135587692261 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043847, "balance_loss_mlp": 1.03491819, "diversity_loss_mlp": 0.0, "epoch": 0.9367064255482878, "flos": 590503509504.0, "grad_norm": 0.07053266752313711, "language_loss": 0.81646079, "learning_rate": 1.0468982538466287e-05, "loss": 0.82689929, "num_input_tokens_seen": 403361728, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 4869, "time_per_iteration": 2.708939790725708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046683, "balance_loss_mlp": 1.03781986, "diversity_loss_mlp": 0.0, "epoch": 0.9368988072335513, "flos": 526637790720.0, "grad_norm": 0.060976282943095796, "language_loss": 0.82172537, "learning_rate": 1.0405659359653597e-05, "loss": 0.83219218, "num_input_tokens_seen": 403431536, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4870, "time_per_iteration": 2.65925669670105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010449, "balance_loss_mlp": 1.03607297, "diversity_loss_mlp": 0.0, "epoch": 0.9370911889188149, "flos": 743205279744.0, "grad_norm": 0.062164083958686674, "language_loss": 0.78947985, "learning_rate": 1.034252625822113e-05, "loss": 0.79992884, "num_input_tokens_seen": 403504768, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4871, "time_per_iteration": 2.9242799282073975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01040518, "balance_loss_mlp": 1.03191113, "diversity_loss_mlp": 0.0, "epoch": 0.9372835706040785, "flos": 546038682624.0, "grad_norm": 0.06036408822352837, "language_loss": 0.78672194, "learning_rate": 1.0279583258679448e-05, "loss": 0.79712713, "num_input_tokens_seen": 403575584, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 4872, "time_per_iteration": 2.7019548416137695 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044427, "balance_loss_mlp": 1.03556955, "diversity_loss_mlp": 0.0, "epoch": 0.9374759522893421, "flos": 491633515008.0, "grad_norm": 0.0656254889693481, "language_loss": 0.81680477, "learning_rate": 1.0216830385465003e-05, "loss": 0.82724905, "num_input_tokens_seen": 403648720, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 4873, "time_per_iteration": 2.6661787033081055 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104974, "balance_loss_mlp": 1.04079902, "diversity_loss_mlp": 0.0, "epoch": 0.9376683339746056, "flos": 578421222912.0, "grad_norm": 0.07062356836033176, "language_loss": 0.82414687, "learning_rate": 1.0154267662940809e-05, "loss": 0.83464432, "num_input_tokens_seen": 403721392, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4874, "time_per_iteration": 2.711991310119629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047638, "balance_loss_mlp": 1.03853059, "diversity_loss_mlp": 0.0, "epoch": 0.9378607156598692, "flos": 506290549248.0, "grad_norm": 0.07310284560827243, "language_loss": 0.80373824, "learning_rate": 1.0091895115395766e-05, "loss": 0.81421459, "num_input_tokens_seen": 403792112, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 4875, "time_per_iteration": 2.650681972503662 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046316, "balance_loss_mlp": 1.03720248, "diversity_loss_mlp": 0.0, "epoch": 0.9380530973451328, "flos": 520015915008.0, "grad_norm": 0.062293314386374414, "language_loss": 0.77575111, "learning_rate": 1.0029712767045062e-05, "loss": 0.78621429, "num_input_tokens_seen": 403860928, "router_z_loss_mlp": 0.09112549, "routerloss_mlp": 0.0, "step": 4876, "time_per_iteration": 2.6609630584716797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046448, "balance_loss_mlp": 1.03741789, "diversity_loss_mlp": 0.0, "epoch": 0.9382454790303963, "flos": 557799768576.0, "grad_norm": 0.06315414550541629, "language_loss": 0.84719789, "learning_rate": 9.967720642029999e-06, "loss": 0.85766232, "num_input_tokens_seen": 403928240, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4877, "time_per_iteration": 2.651707172393799 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045511, "balance_loss_mlp": 1.03690422, "diversity_loss_mlp": 0.0, "epoch": 0.9384378607156598, "flos": 695476316160.0, "grad_norm": 0.0631685338403412, "language_loss": 0.81854308, "learning_rate": 9.905918764418153e-06, "loss": 0.82899821, "num_input_tokens_seen": 404004320, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 4878, "time_per_iteration": 2.949418783187866 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049635, "balance_loss_mlp": 1.04068828, "diversity_loss_mlp": 0.0, "epoch": 0.9386302424009234, "flos": 554750747136.0, "grad_norm": 0.08565110846317762, "language_loss": 0.80980134, "learning_rate": 9.844307158203058e-06, "loss": 0.82029772, "num_input_tokens_seen": 404077040, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4879, "time_per_iteration": 2.6912460327148438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048881, "balance_loss_mlp": 1.03982735, "diversity_loss_mlp": 0.0, "epoch": 0.938822624086187, "flos": 566981337600.0, "grad_norm": 0.0804374374941349, "language_loss": 0.79621142, "learning_rate": 9.782885847304469e-06, "loss": 0.80670023, "num_input_tokens_seen": 404145248, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4880, "time_per_iteration": 2.6459033489227295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045381, "balance_loss_mlp": 1.03668451, "diversity_loss_mlp": 0.0, "epoch": 0.9390150057714506, "flos": 417602801664.0, "grad_norm": 0.07482420746454603, "language_loss": 0.80257022, "learning_rate": 9.721654855568196e-06, "loss": 0.81302404, "num_input_tokens_seen": 404212000, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 4881, "time_per_iteration": 2.5740063190460205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046316, "balance_loss_mlp": 1.03760171, "diversity_loss_mlp": 0.0, "epoch": 0.9392073874567142, "flos": 1553839967232.0, "grad_norm": 0.0852712224295467, "language_loss": 0.76510745, "learning_rate": 9.660614206766394e-06, "loss": 0.77557057, "num_input_tokens_seen": 404305408, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4882, "time_per_iteration": 3.689307689666748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050073, "balance_loss_mlp": 1.04114449, "diversity_loss_mlp": 0.0, "epoch": 0.9393997691419776, "flos": 652536000000.0, "grad_norm": 0.09232552056587429, "language_loss": 0.7808578, "learning_rate": 9.59976392459705e-06, "loss": 0.79135859, "num_input_tokens_seen": 404383248, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4883, "time_per_iteration": 2.7796614170074463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004059, "balance_loss_mlp": 0.99962485, "diversity_loss_mlp": 0.0, "epoch": 0.9395921508272412, "flos": 1553294817792.0, "grad_norm": 0.004454986396057403, "language_loss": 0.78170681, "learning_rate": 9.539104032684209e-06, "loss": 0.79174733, "num_input_tokens_seen": 404615264, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4884, "time_per_iteration": 4.815665245056152 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049205, "balance_loss_mlp": 1.04029393, "diversity_loss_mlp": 0.0, "epoch": 0.9397845325125048, "flos": 498144162816.0, "grad_norm": 0.06863865940742271, "language_loss": 0.78660077, "learning_rate": 9.478634554578314e-06, "loss": 0.79709285, "num_input_tokens_seen": 404684656, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4885, "time_per_iteration": 2.6168384552001953 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104537, "balance_loss_mlp": 1.03678083, "diversity_loss_mlp": 0.0, "epoch": 0.9399769141977684, "flos": 498596414976.0, "grad_norm": 0.07504646640886149, "language_loss": 0.83853602, "learning_rate": 9.418355513755638e-06, "loss": 0.84898973, "num_input_tokens_seen": 404752096, "router_z_loss_mlp": 0.08599854, "routerloss_mlp": 0.0, "step": 4886, "time_per_iteration": 2.5939505100250244 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00589544, "balance_loss_mlp": 1.02856016, "diversity_loss_mlp": 0.13189431, "epoch": 0.9401692958830319, "flos": 1402500427776.0, "grad_norm": 0.0012775322428382279, "language_loss": 0.79332191, "learning_rate": 9.358266933618575e-06, "loss": 0.79921734, "num_input_tokens_seen": 404980944, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00931658, "step": 4887, "time_per_iteration": 4.869856357574463 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047496, "balance_loss_mlp": 1.03856742, "diversity_loss_mlp": 0.0, "epoch": 0.9403616775682955, "flos": 540123448320.0, "grad_norm": 0.06148309655419226, "language_loss": 0.85074973, "learning_rate": 9.298368837495575e-06, "loss": 0.86122465, "num_input_tokens_seen": 405056688, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 4888, "time_per_iteration": 2.723494052886963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004242, "balance_loss_mlp": 0.99983096, "diversity_loss_mlp": 0.0, "epoch": 0.9405540592535591, "flos": 1322058184704.0, "grad_norm": 0.0026510918871896585, "language_loss": 0.75169432, "learning_rate": 9.238661248641089e-06, "loss": 0.76173675, "num_input_tokens_seen": 405284656, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4889, "time_per_iteration": 4.887513637542725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047457, "balance_loss_mlp": 1.03848016, "diversity_loss_mlp": 0.0, "epoch": 0.9407464409388226, "flos": 572362827264.0, "grad_norm": 0.07795508435687046, "language_loss": 0.83106863, "learning_rate": 9.179144190235799e-06, "loss": 0.8415432, "num_input_tokens_seen": 405351584, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4890, "time_per_iteration": 2.6607882976531982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046827, "balance_loss_mlp": 1.03781509, "diversity_loss_mlp": 0.0, "epoch": 0.9409388226240862, "flos": 511264203264.0, "grad_norm": 0.06087500740988416, "language_loss": 0.76773834, "learning_rate": 9.119817685386112e-06, "loss": 0.77820671, "num_input_tokens_seen": 405425712, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 4891, "time_per_iteration": 2.704505205154419 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004256, "balance_loss_mlp": 0.99982125, "diversity_loss_mlp": 0.0, "epoch": 0.9411312043093497, "flos": 1569901077504.0, "grad_norm": 0.0026524442975608157, "language_loss": 0.80241883, "learning_rate": 9.06068175712471e-06, "loss": 0.81246138, "num_input_tokens_seen": 405655760, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4892, "time_per_iteration": 4.861233949661255 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049636, "balance_loss_mlp": 1.04099298, "diversity_loss_mlp": 0.0, "epoch": 0.9413235859946133, "flos": 569469450240.0, "grad_norm": 0.0781928260181619, "language_loss": 0.78609961, "learning_rate": 9.001736428410234e-06, "loss": 0.79659593, "num_input_tokens_seen": 405731664, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 4893, "time_per_iteration": 2.7279999256134033 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048837, "balance_loss_mlp": 1.03969967, "diversity_loss_mlp": 0.0, "epoch": 0.9415159676798769, "flos": 781905747456.0, "grad_norm": 0.06974865955281616, "language_loss": 0.80413878, "learning_rate": 8.942981722127263e-06, "loss": 0.81462717, "num_input_tokens_seen": 405808128, "router_z_loss_mlp": 0.0914917, "routerloss_mlp": 0.0, "step": 4894, "time_per_iteration": 3.0058786869049072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050294, "balance_loss_mlp": 1.04135358, "diversity_loss_mlp": 0.0, "epoch": 0.9417083493651405, "flos": 849341749248.0, "grad_norm": 0.08932063460271895, "language_loss": 0.79991817, "learning_rate": 8.884417661086331e-06, "loss": 0.81042111, "num_input_tokens_seen": 405892448, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4895, "time_per_iteration": 3.1561882495880127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046423, "balance_loss_mlp": 1.03778601, "diversity_loss_mlp": 0.0, "epoch": 0.941900731050404, "flos": 529333304832.0, "grad_norm": 0.0641512346414091, "language_loss": 0.85852486, "learning_rate": 8.826044268024025e-06, "loss": 0.86898911, "num_input_tokens_seen": 405966736, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 4896, "time_per_iteration": 2.6913957595825195 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045265, "balance_loss_mlp": 1.03639615, "diversity_loss_mlp": 0.0, "epoch": 0.9420931127356675, "flos": 557073303552.0, "grad_norm": 0.0665448744143015, "language_loss": 0.80267036, "learning_rate": 8.767861565602997e-06, "loss": 0.81312299, "num_input_tokens_seen": 406043264, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 4897, "time_per_iteration": 2.7335498332977295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104815, "balance_loss_mlp": 1.0395788, "diversity_loss_mlp": 0.0, "epoch": 0.9422854944209311, "flos": 652543340544.0, "grad_norm": 0.07266036540005272, "language_loss": 0.86784083, "learning_rate": 8.709869576411733e-06, "loss": 0.87832236, "num_input_tokens_seen": 406119552, "router_z_loss_mlp": 0.08581543, "routerloss_mlp": 0.0, "step": 4898, "time_per_iteration": 2.820343255996704 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049698, "balance_loss_mlp": 1.04090655, "diversity_loss_mlp": 0.0, "epoch": 0.9424778761061947, "flos": 553685829120.0, "grad_norm": 0.07366201746067845, "language_loss": 0.84326768, "learning_rate": 8.65206832296478e-06, "loss": 0.85376465, "num_input_tokens_seen": 406192464, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 4899, "time_per_iteration": 2.708554744720459 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045104, "balance_loss_mlp": 1.03626442, "diversity_loss_mlp": 0.0, "epoch": 0.9426702577914583, "flos": 588559053312.0, "grad_norm": 0.07321817964783915, "language_loss": 0.79721165, "learning_rate": 8.594457827702406e-06, "loss": 0.80766267, "num_input_tokens_seen": 406262640, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4900, "time_per_iteration": 2.674393892288208 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054375, "balance_loss_mlp": 1.04557145, "diversity_loss_mlp": 0.0, "epoch": 0.9428626394767218, "flos": 616625021952.0, "grad_norm": 0.0749978632070715, "language_loss": 0.78455758, "learning_rate": 8.537038112991114e-06, "loss": 0.79510128, "num_input_tokens_seen": 406341328, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4901, "time_per_iteration": 2.805161952972412 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047513, "balance_loss_mlp": 1.03873909, "diversity_loss_mlp": 0.0, "epoch": 0.9430550211619854, "flos": 610410981888.0, "grad_norm": 0.07047076389805079, "language_loss": 0.82071722, "learning_rate": 8.479809201123178e-06, "loss": 0.83119237, "num_input_tokens_seen": 406418864, "router_z_loss_mlp": 0.08789062, "routerloss_mlp": 0.0, "step": 4902, "time_per_iteration": 2.732999324798584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047853, "balance_loss_mlp": 1.03907359, "diversity_loss_mlp": 0.0, "epoch": 0.943247402847249, "flos": 565990571520.0, "grad_norm": 0.06786486493908951, "language_loss": 0.78043211, "learning_rate": 8.422771114316885e-06, "loss": 0.79091066, "num_input_tokens_seen": 406492320, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 4903, "time_per_iteration": 2.7100279331207275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048262, "balance_loss_mlp": 1.03943491, "diversity_loss_mlp": 0.0, "epoch": 0.9434397845325125, "flos": 527040483840.0, "grad_norm": 0.07474785644916408, "language_loss": 0.81409293, "learning_rate": 8.365923874716297e-06, "loss": 0.82457554, "num_input_tokens_seen": 406560448, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 4904, "time_per_iteration": 2.598313093185425 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046884, "balance_loss_mlp": 1.0381397, "diversity_loss_mlp": 0.0, "epoch": 0.943632166217776, "flos": 593451214848.0, "grad_norm": 0.06861839019347821, "language_loss": 0.82857311, "learning_rate": 8.309267504391593e-06, "loss": 0.83904195, "num_input_tokens_seen": 406631376, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4905, "time_per_iteration": 2.7130138874053955 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010495, "balance_loss_mlp": 1.04049969, "diversity_loss_mlp": 0.0, "epoch": 0.9438245479030396, "flos": 572770289664.0, "grad_norm": 0.05740754157545699, "language_loss": 0.85487771, "learning_rate": 8.252802025338623e-06, "loss": 0.86537278, "num_input_tokens_seen": 406713728, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 4906, "time_per_iteration": 2.819689989089966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046047, "balance_loss_mlp": 1.03723109, "diversity_loss_mlp": 0.0, "epoch": 0.9440169295883032, "flos": 488258523648.0, "grad_norm": 0.0749683755111213, "language_loss": 0.81567025, "learning_rate": 8.196527459479242e-06, "loss": 0.82613063, "num_input_tokens_seen": 406779168, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4907, "time_per_iteration": 2.554344415664673 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049809, "balance_loss_mlp": 1.04098761, "diversity_loss_mlp": 0.0, "epoch": 0.9442093112735668, "flos": 731742999552.0, "grad_norm": 0.06901073906266146, "language_loss": 0.73883832, "learning_rate": 8.140443828661137e-06, "loss": 0.74933642, "num_input_tokens_seen": 406860816, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4908, "time_per_iteration": 3.0110507011413574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047925, "balance_loss_mlp": 1.03897214, "diversity_loss_mlp": 0.0, "epoch": 0.9444016929588304, "flos": 571031036928.0, "grad_norm": 0.07411833720689497, "language_loss": 0.8239246, "learning_rate": 8.084551154658004e-06, "loss": 0.83440387, "num_input_tokens_seen": 406929888, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4909, "time_per_iteration": 2.6770436763763428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047051, "balance_loss_mlp": 1.03801453, "diversity_loss_mlp": 0.0, "epoch": 0.9445940746440938, "flos": 509292582912.0, "grad_norm": 0.06788128134122538, "language_loss": 0.86283165, "learning_rate": 8.028849459169318e-06, "loss": 0.8733021, "num_input_tokens_seen": 406998224, "router_z_loss_mlp": 0.09039307, "routerloss_mlp": 0.0, "step": 4910, "time_per_iteration": 2.582549810409546 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049067, "balance_loss_mlp": 1.04030466, "diversity_loss_mlp": 0.0, "epoch": 0.9447864563293574, "flos": 624556293120.0, "grad_norm": 0.0678450295570026, "language_loss": 0.80976182, "learning_rate": 7.97333876382028e-06, "loss": 0.82025248, "num_input_tokens_seen": 407075088, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4911, "time_per_iteration": 2.8425984382629395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049899, "balance_loss_mlp": 1.04112482, "diversity_loss_mlp": 0.0, "epoch": 0.944978838014621, "flos": 505270047744.0, "grad_norm": 0.08525541673585063, "language_loss": 0.81182563, "learning_rate": 7.918019090162098e-06, "loss": 0.82232463, "num_input_tokens_seen": 407147792, "router_z_loss_mlp": 0.08789062, "routerloss_mlp": 0.0, "step": 4912, "time_per_iteration": 2.7192227840423584 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004436, "balance_loss_mlp": 1.00002539, "diversity_loss_mlp": 0.0, "epoch": 0.9451712196998846, "flos": 1484205451776.0, "grad_norm": 0.00558203174928547, "language_loss": 0.78287339, "learning_rate": 7.862890459671812e-06, "loss": 0.79291773, "num_input_tokens_seen": 407387216, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4913, "time_per_iteration": 4.945667505264282 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050028, "balance_loss_mlp": 1.0412302, "diversity_loss_mlp": 0.0, "epoch": 0.9453636013851482, "flos": 521137732608.0, "grad_norm": 0.07323836789774518, "language_loss": 0.90345061, "learning_rate": 7.80795289375219e-06, "loss": 0.91395086, "num_input_tokens_seen": 407457664, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4914, "time_per_iteration": 2.628188371658325 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004442, "balance_loss_mlp": 1.00000703, "diversity_loss_mlp": 0.0, "epoch": 0.9455559830704117, "flos": 1496902975488.0, "grad_norm": 0.00558152160329536, "language_loss": 0.8356235, "learning_rate": 7.75320641373195e-06, "loss": 0.8456679, "num_input_tokens_seen": 407700256, "router_z_loss_mlp": 0.04443359, "routerloss_mlp": 0.0, "step": 4915, "time_per_iteration": 4.940939426422119 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049716, "balance_loss_mlp": 1.04091215, "diversity_loss_mlp": 0.0, "epoch": 0.9457483647556753, "flos": 498126910464.0, "grad_norm": 0.05816068289189103, "language_loss": 0.81779099, "learning_rate": 7.698651040865534e-06, "loss": 0.8282882, "num_input_tokens_seen": 407770080, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4916, "time_per_iteration": 2.622225522994995 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045766, "balance_loss_mlp": 1.03712368, "diversity_loss_mlp": 0.0, "epoch": 0.9459407464409388, "flos": 1019405979648.0, "grad_norm": 0.06122686842867312, "language_loss": 0.82315564, "learning_rate": 7.644286796333222e-06, "loss": 0.83361328, "num_input_tokens_seen": 407854640, "router_z_loss_mlp": 0.08654785, "routerloss_mlp": 0.0, "step": 4917, "time_per_iteration": 3.3565821647644043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050444, "balance_loss_mlp": 1.04189634, "diversity_loss_mlp": 0.0, "epoch": 0.9461331281262024, "flos": 513589330944.0, "grad_norm": 0.07064430272408662, "language_loss": 0.81672692, "learning_rate": 7.590113701241075e-06, "loss": 0.82723141, "num_input_tokens_seen": 407922704, "router_z_loss_mlp": 0.08557129, "routerloss_mlp": 0.0, "step": 4918, "time_per_iteration": 2.609248399734497 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049398, "balance_loss_mlp": 1.04064822, "diversity_loss_mlp": 0.0, "epoch": 0.9463255098114659, "flos": 528023909376.0, "grad_norm": 0.07970710282703287, "language_loss": 0.7821058, "learning_rate": 7.536131776620936e-06, "loss": 0.7925998, "num_input_tokens_seen": 407991136, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4919, "time_per_iteration": 2.6066248416900635 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049566, "balance_loss_mlp": 1.0406847, "diversity_loss_mlp": 0.0, "epoch": 0.9465178914967295, "flos": 506043500544.0, "grad_norm": 0.08687319482199532, "language_loss": 0.83590424, "learning_rate": 7.482341043430485e-06, "loss": 0.8463999, "num_input_tokens_seen": 408056576, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4920, "time_per_iteration": 2.579651117324829 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045992, "balance_loss_mlp": 1.03711653, "diversity_loss_mlp": 0.0, "epoch": 0.9467102731819931, "flos": 660254727168.0, "grad_norm": 0.06849366756552606, "language_loss": 0.85644251, "learning_rate": 7.428741522553184e-06, "loss": 0.86690247, "num_input_tokens_seen": 408136960, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4921, "time_per_iteration": 2.9116263389587402 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045153, "balance_loss_mlp": 1.03621817, "diversity_loss_mlp": 0.0, "epoch": 0.9469026548672567, "flos": 675183403008.0, "grad_norm": 0.06484399276768851, "language_loss": 0.89472318, "learning_rate": 7.375333234798054e-06, "loss": 0.90517473, "num_input_tokens_seen": 408218304, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4922, "time_per_iteration": 2.9387049674987793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047394, "balance_loss_mlp": 1.03844738, "diversity_loss_mlp": 0.0, "epoch": 0.9470950365525203, "flos": 513964859904.0, "grad_norm": 0.08622456288461161, "language_loss": 0.80096912, "learning_rate": 7.32211620090012e-06, "loss": 0.81144309, "num_input_tokens_seen": 408287936, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4923, "time_per_iteration": 2.6302578449249268 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050313, "balance_loss_mlp": 1.04158056, "diversity_loss_mlp": 0.0, "epoch": 0.9472874182377837, "flos": 550103063040.0, "grad_norm": 0.0601694962527871, "language_loss": 0.81003237, "learning_rate": 7.269090441520132e-06, "loss": 0.82053542, "num_input_tokens_seen": 408365568, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 4924, "time_per_iteration": 2.808788299560547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051085, "balance_loss_mlp": 1.04240632, "diversity_loss_mlp": 0.0, "epoch": 0.9474797999230473, "flos": 542769776640.0, "grad_norm": 0.06384621728093878, "language_loss": 0.80346602, "learning_rate": 7.216255977244457e-06, "loss": 0.81397688, "num_input_tokens_seen": 408431248, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 4925, "time_per_iteration": 2.6172335147857666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049268, "balance_loss_mlp": 1.04039288, "diversity_loss_mlp": 0.0, "epoch": 0.9476721816083109, "flos": 844644879360.0, "grad_norm": 0.06326857300487894, "language_loss": 0.85833907, "learning_rate": 7.163612828585242e-06, "loss": 0.86883175, "num_input_tokens_seen": 408514112, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 4926, "time_per_iteration": 3.1013805866241455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046961, "balance_loss_mlp": 1.03822935, "diversity_loss_mlp": 0.0, "epoch": 0.9478645632935745, "flos": 638002676736.0, "grad_norm": 0.0714765450100148, "language_loss": 0.7945109, "learning_rate": 7.1111610159803605e-06, "loss": 0.80498052, "num_input_tokens_seen": 408585968, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 4927, "time_per_iteration": 2.7759459018707275 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044976, "balance_loss_mlp": 1.03620195, "diversity_loss_mlp": 0.0, "epoch": 0.948056944978838, "flos": 656832748032.0, "grad_norm": 0.08515861260909238, "language_loss": 0.75973248, "learning_rate": 7.058900559793469e-06, "loss": 0.77018219, "num_input_tokens_seen": 408665456, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 4928, "time_per_iteration": 2.8861470222473145 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052944, "balance_loss_mlp": 1.04416978, "diversity_loss_mlp": 0.0, "epoch": 0.9482493266641016, "flos": 440907660288.0, "grad_norm": 0.06735199813953592, "language_loss": 0.83267879, "learning_rate": 7.00683148031378e-06, "loss": 0.84320819, "num_input_tokens_seen": 408730192, "router_z_loss_mlp": 0.08776855, "routerloss_mlp": 0.0, "step": 4929, "time_per_iteration": 2.510803699493408 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045383, "balance_loss_mlp": 1.03666258, "diversity_loss_mlp": 0.0, "epoch": 0.9484417083493651, "flos": 545989123584.0, "grad_norm": 0.06926665939050473, "language_loss": 0.78147107, "learning_rate": 6.9549537977564024e-06, "loss": 0.79192489, "num_input_tokens_seen": 408807616, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 4930, "time_per_iteration": 2.7705516815185547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784775, "balance_loss_mlp": 1.32251549, "diversity_loss_mlp": 0.22577199, "epoch": 0.9486340900346287, "flos": 538598937600.0, "grad_norm": 0.030705907107943475, "language_loss": 0.80018926, "learning_rate": 6.903267532262003e-06, "loss": 0.80803692, "num_input_tokens_seen": 408883552, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01063121, "step": 4931, "time_per_iteration": 2.700617551803589 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052654, "balance_loss_mlp": 1.04359388, "diversity_loss_mlp": 0.0, "epoch": 0.9488264717198923, "flos": 681669457920.0, "grad_norm": 0.07163166168335688, "language_loss": 0.85786635, "learning_rate": 6.851772703896975e-06, "loss": 0.86839288, "num_input_tokens_seen": 408956400, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 4932, "time_per_iteration": 2.8230526447296143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045477, "balance_loss_mlp": 1.03682232, "diversity_loss_mlp": 0.0, "epoch": 0.9490188534051558, "flos": 462603944448.0, "grad_norm": 0.07113425512473334, "language_loss": 0.88082981, "learning_rate": 6.8004693326533805e-06, "loss": 0.89128458, "num_input_tokens_seen": 409019904, "router_z_loss_mlp": 0.08660889, "routerloss_mlp": 0.0, "step": 4933, "time_per_iteration": 2.5242044925689697 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052399, "balance_loss_mlp": 1.04369068, "diversity_loss_mlp": 0.0, "epoch": 0.9492112350904194, "flos": 543135393792.0, "grad_norm": 0.06957529053478449, "language_loss": 0.82772219, "learning_rate": 6.7493574384489e-06, "loss": 0.83824623, "num_input_tokens_seen": 409094288, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 4934, "time_per_iteration": 2.682114362716675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046039, "balance_loss_mlp": 1.03765225, "diversity_loss_mlp": 0.0, "epoch": 0.949403616775683, "flos": 550322947584.0, "grad_norm": 0.06306988880080433, "language_loss": 0.8386761, "learning_rate": 6.698437041126992e-06, "loss": 0.84913647, "num_input_tokens_seen": 409169120, "router_z_loss_mlp": 0.0838623, "routerloss_mlp": 0.0, "step": 4935, "time_per_iteration": 2.726893424987793 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046934, "balance_loss_mlp": 1.03838086, "diversity_loss_mlp": 0.0, "epoch": 0.9495959984609466, "flos": 598383023616.0, "grad_norm": 0.05973475098726946, "language_loss": 0.82893109, "learning_rate": 6.647708160456678e-06, "loss": 0.83940041, "num_input_tokens_seen": 409243200, "router_z_loss_mlp": 0.08563232, "routerloss_mlp": 0.0, "step": 4936, "time_per_iteration": 2.729111671447754 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046751, "balance_loss_mlp": 1.03814435, "diversity_loss_mlp": 0.0, "epoch": 0.94978838014621, "flos": 608409626112.0, "grad_norm": 0.07659756248200288, "language_loss": 0.82697654, "learning_rate": 6.597170816132702e-06, "loss": 0.83744407, "num_input_tokens_seen": 409319264, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 4937, "time_per_iteration": 2.8081254959106445 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784639, "balance_loss_mlp": 1.32296765, "diversity_loss_mlp": 0.22491853, "epoch": 0.9499807618314736, "flos": 540832660992.0, "grad_norm": 0.031155014429691368, "language_loss": 0.86999297, "learning_rate": 6.546825027775427e-06, "loss": 0.87783933, "num_input_tokens_seen": 409389840, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01069584, "step": 4938, "time_per_iteration": 2.647392749786377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049194, "balance_loss_mlp": 1.04043269, "diversity_loss_mlp": 0.0, "epoch": 0.9501731435167372, "flos": 594600196608.0, "grad_norm": 0.06549207812906088, "language_loss": 0.82709306, "learning_rate": 6.496670814930717e-06, "loss": 0.83758503, "num_input_tokens_seen": 409458752, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4939, "time_per_iteration": 2.6947948932647705 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049253, "balance_loss_mlp": 1.04041934, "diversity_loss_mlp": 0.0, "epoch": 0.9503655252020008, "flos": 454138928640.0, "grad_norm": 0.0674263053300071, "language_loss": 0.80045903, "learning_rate": 6.446708197070161e-06, "loss": 0.81095159, "num_input_tokens_seen": 409525008, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 4940, "time_per_iteration": 2.537261486053467 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047818, "balance_loss_mlp": 1.03906798, "diversity_loss_mlp": 0.0, "epoch": 0.9505579068872644, "flos": 667944092160.0, "grad_norm": 0.06671960471522939, "language_loss": 0.84743893, "learning_rate": 6.396937193591079e-06, "loss": 0.85791707, "num_input_tokens_seen": 409603376, "router_z_loss_mlp": 0.08764648, "routerloss_mlp": 0.0, "step": 4941, "time_per_iteration": 2.7824418544769287 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051988, "balance_loss_mlp": 1.04320264, "diversity_loss_mlp": 0.0, "epoch": 0.9507502885725279, "flos": 402207192576.0, "grad_norm": 0.07518292778028754, "language_loss": 0.81734824, "learning_rate": 6.347357823816235e-06, "loss": 0.8278681, "num_input_tokens_seen": 409667168, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 4942, "time_per_iteration": 2.5175111293792725 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045578, "balance_loss_mlp": 1.03662586, "diversity_loss_mlp": 0.0, "epoch": 0.9509426702577914, "flos": 700358565888.0, "grad_norm": 0.06073583327995898, "language_loss": 0.79565704, "learning_rate": 6.297970106994011e-06, "loss": 0.80611289, "num_input_tokens_seen": 409746832, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 4943, "time_per_iteration": 2.98564076423645 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044523, "balance_loss_mlp": 1.03589809, "diversity_loss_mlp": 0.0, "epoch": 0.951135051943055, "flos": 501415640064.0, "grad_norm": 0.07464458367850044, "language_loss": 0.82931554, "learning_rate": 6.2487740622985126e-06, "loss": 0.83976078, "num_input_tokens_seen": 409813792, "router_z_loss_mlp": 0.08630371, "routerloss_mlp": 0.0, "step": 4944, "time_per_iteration": 2.586824417114258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048064, "balance_loss_mlp": 1.03944492, "diversity_loss_mlp": 0.0, "epoch": 0.9513274336283186, "flos": 614621094912.0, "grad_norm": 0.0706686343064775, "language_loss": 0.81845355, "learning_rate": 6.1997697088292395e-06, "loss": 0.82893419, "num_input_tokens_seen": 409898848, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 4945, "time_per_iteration": 2.921309232711792 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046293, "balance_loss_mlp": 1.03738809, "diversity_loss_mlp": 0.0, "epoch": 0.9515198153135821, "flos": 519586057728.0, "grad_norm": 0.07524726970917751, "language_loss": 0.82137179, "learning_rate": 6.150957065611363e-06, "loss": 0.83183479, "num_input_tokens_seen": 409966368, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4946, "time_per_iteration": 2.5640242099761963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049384, "balance_loss_mlp": 1.04034781, "diversity_loss_mlp": 0.0, "epoch": 0.9517121969988457, "flos": 664954168320.0, "grad_norm": 0.07065066286266242, "language_loss": 0.76635486, "learning_rate": 6.102336151595667e-06, "loss": 0.77684867, "num_input_tokens_seen": 410048496, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 4947, "time_per_iteration": 2.965193033218384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049032, "balance_loss_mlp": 1.04028833, "diversity_loss_mlp": 0.0, "epoch": 0.9519045786841093, "flos": 676409107968.0, "grad_norm": 0.06944081610529035, "language_loss": 0.75779366, "learning_rate": 6.053906985658553e-06, "loss": 0.76828402, "num_input_tokens_seen": 410121840, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 4948, "time_per_iteration": 2.8114254474639893 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047439, "balance_loss_mlp": 1.03859949, "diversity_loss_mlp": 0.0, "epoch": 0.9520969603693729, "flos": 652901617152.0, "grad_norm": 0.06267886834412634, "language_loss": 0.80306596, "learning_rate": 6.005669586601814e-06, "loss": 0.81354034, "num_input_tokens_seen": 410199152, "router_z_loss_mlp": 0.08843994, "routerloss_mlp": 0.0, "step": 4949, "time_per_iteration": 2.829516887664795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047548, "balance_loss_mlp": 1.03901839, "diversity_loss_mlp": 0.0, "epoch": 0.9522893420546364, "flos": 743284200960.0, "grad_norm": 0.06460536676220141, "language_loss": 0.83404064, "learning_rate": 5.957623973152748e-06, "loss": 0.84451616, "num_input_tokens_seen": 410285392, "router_z_loss_mlp": 0.08538818, "routerloss_mlp": 0.0, "step": 4950, "time_per_iteration": 3.064345359802246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047703, "balance_loss_mlp": 1.03875649, "diversity_loss_mlp": 0.0, "epoch": 0.9524817237398999, "flos": 761696898048.0, "grad_norm": 0.07065514061093704, "language_loss": 0.80931592, "learning_rate": 5.909770163964545e-06, "loss": 0.81979299, "num_input_tokens_seen": 410359872, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 4951, "time_per_iteration": 2.9210174083709717 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045969, "balance_loss_mlp": 1.03724885, "diversity_loss_mlp": 0.0, "epoch": 0.9526741054251635, "flos": 529125903360.0, "grad_norm": 0.0779800356462361, "language_loss": 0.82006431, "learning_rate": 5.8621081776155105e-06, "loss": 0.83052403, "num_input_tokens_seen": 410425728, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4952, "time_per_iteration": 2.570007801055908 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048957, "balance_loss_mlp": 1.0397898, "diversity_loss_mlp": 0.0, "epoch": 0.9528664871104271, "flos": 488441332224.0, "grad_norm": 0.07317068745782636, "language_loss": 0.81126779, "learning_rate": 5.814638032609787e-06, "loss": 0.82175738, "num_input_tokens_seen": 410496080, "router_z_loss_mlp": 0.0916748, "routerloss_mlp": 0.0, "step": 4953, "time_per_iteration": 2.593344211578369 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047896, "balance_loss_mlp": 1.03926563, "diversity_loss_mlp": 0.0, "epoch": 0.9530588687956907, "flos": 517745115648.0, "grad_norm": 0.06495580169291973, "language_loss": 0.85402286, "learning_rate": 5.76735974737691e-06, "loss": 0.86450183, "num_input_tokens_seen": 410576448, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 4954, "time_per_iteration": 2.757946491241455 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00780626, "balance_loss_mlp": 1.31376719, "diversity_loss_mlp": 0.22618601, "epoch": 0.9532512504809542, "flos": 675148898304.0, "grad_norm": 0.03586731087797504, "language_loss": 0.81228065, "learning_rate": 5.720273340271864e-06, "loss": 0.82008696, "num_input_tokens_seen": 410655792, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0106497, "step": 4955, "time_per_iteration": 2.883862018585205 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049414, "balance_loss_mlp": 1.04027104, "diversity_loss_mlp": 0.0, "epoch": 0.9534436321662177, "flos": 489523502592.0, "grad_norm": 0.07193968737801358, "language_loss": 0.84132719, "learning_rate": 5.673378829575249e-06, "loss": 0.85182136, "num_input_tokens_seen": 410725440, "router_z_loss_mlp": 0.09143066, "routerloss_mlp": 0.0, "step": 4956, "time_per_iteration": 2.5883569717407227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046949, "balance_loss_mlp": 1.03826427, "diversity_loss_mlp": 0.0, "epoch": 0.9536360138514813, "flos": 496585147392.0, "grad_norm": 0.06822952225428794, "language_loss": 0.81915605, "learning_rate": 5.626676233493167e-06, "loss": 0.82962549, "num_input_tokens_seen": 410797552, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 4957, "time_per_iteration": 2.630600690841675 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048744, "balance_loss_mlp": 1.040012, "diversity_loss_mlp": 0.0, "epoch": 0.9538283955367449, "flos": 801462283776.0, "grad_norm": 0.05995693166435021, "language_loss": 0.83973289, "learning_rate": 5.580165570157114e-06, "loss": 0.85022032, "num_input_tokens_seen": 410876736, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 4958, "time_per_iteration": 3.0566930770874023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045737, "balance_loss_mlp": 1.0366534, "diversity_loss_mlp": 0.0, "epoch": 0.9540207772220085, "flos": 556668039168.0, "grad_norm": 0.06699001332746012, "language_loss": 0.80331284, "learning_rate": 5.533846857624203e-06, "loss": 0.81377017, "num_input_tokens_seen": 410955632, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 4959, "time_per_iteration": 2.761378049850464 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047149, "balance_loss_mlp": 1.03821445, "diversity_loss_mlp": 0.0, "epoch": 0.954213158907272, "flos": 684505935360.0, "grad_norm": 0.0761611393687458, "language_loss": 0.82048774, "learning_rate": 5.487720113876882e-06, "loss": 0.83095926, "num_input_tokens_seen": 411038480, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4960, "time_per_iteration": 2.932245969772339 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048944, "balance_loss_mlp": 1.04009259, "diversity_loss_mlp": 0.0, "epoch": 0.9544055405925356, "flos": 535752548352.0, "grad_norm": 0.06840338993330367, "language_loss": 0.8257823, "learning_rate": 5.441785356823214e-06, "loss": 0.83627176, "num_input_tokens_seen": 411109744, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4961, "time_per_iteration": 2.7189135551452637 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049536, "balance_loss_mlp": 1.04058886, "diversity_loss_mlp": 0.0, "epoch": 0.9545979222777992, "flos": 825404401152.0, "grad_norm": 0.06804248679935226, "language_loss": 0.80613565, "learning_rate": 5.3960426042965476e-06, "loss": 0.81663102, "num_input_tokens_seen": 411202192, "router_z_loss_mlp": 0.08947754, "routerloss_mlp": 0.0, "step": 4962, "time_per_iteration": 3.102736711502075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078831, "balance_loss_mlp": 1.33004642, "diversity_loss_mlp": 0.2248106, "epoch": 0.9547903039630627, "flos": 761691755520.0, "grad_norm": 0.03404897095721445, "language_loss": 0.77822566, "learning_rate": 5.3504918740558405e-06, "loss": 0.78610873, "num_input_tokens_seen": 411289248, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01088165, "step": 4963, "time_per_iteration": 3.1009397506713867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051718, "balance_loss_mlp": 1.04287314, "diversity_loss_mlp": 0.0, "epoch": 0.9549826856483262, "flos": 515306562048.0, "grad_norm": 0.0785854138679803, "language_loss": 0.82759595, "learning_rate": 5.3051331837855045e-06, "loss": 0.83811319, "num_input_tokens_seen": 411355232, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4964, "time_per_iteration": 2.5947694778442383 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052088, "balance_loss_mlp": 1.04327834, "diversity_loss_mlp": 0.0, "epoch": 0.9551750673335898, "flos": 643107382272.0, "grad_norm": 0.06792534083569658, "language_loss": 0.82819939, "learning_rate": 5.259966551095341e-06, "loss": 0.83872032, "num_input_tokens_seen": 411432288, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4965, "time_per_iteration": 2.803609609603882 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050399, "balance_loss_mlp": 1.04159546, "diversity_loss_mlp": 0.0, "epoch": 0.9553674490188534, "flos": 472208030208.0, "grad_norm": 0.06616240585597659, "language_loss": 0.8283782, "learning_rate": 5.214991993520546e-06, "loss": 0.83888221, "num_input_tokens_seen": 411499376, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4966, "time_per_iteration": 2.584310531616211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048058, "balance_loss_mlp": 1.03910518, "diversity_loss_mlp": 0.0, "epoch": 0.955559830704117, "flos": 528317945856.0, "grad_norm": 0.07793598675668457, "language_loss": 0.8188796, "learning_rate": 5.170209528521763e-06, "loss": 0.82936013, "num_input_tokens_seen": 411564976, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 4967, "time_per_iteration": 2.592332601547241 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104625, "balance_loss_mlp": 1.03739893, "diversity_loss_mlp": 0.0, "epoch": 0.9557522123893806, "flos": 548168518656.0, "grad_norm": 0.06516874865343447, "language_loss": 0.84235787, "learning_rate": 5.125619173485196e-06, "loss": 0.85282034, "num_input_tokens_seen": 411636464, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 4968, "time_per_iteration": 2.6265814304351807 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044806, "balance_loss_mlp": 1.03580022, "diversity_loss_mlp": 0.0, "epoch": 0.955944594074644, "flos": 509465479680.0, "grad_norm": 0.05920920196225761, "language_loss": 0.81924808, "learning_rate": 5.08122094572222e-06, "loss": 0.82969612, "num_input_tokens_seen": 411710672, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 4969, "time_per_iteration": 2.668456554412842 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104686, "balance_loss_mlp": 1.03809857, "diversity_loss_mlp": 0.0, "epoch": 0.9561369757599076, "flos": 527578997760.0, "grad_norm": 0.07042790663947672, "language_loss": 0.79412282, "learning_rate": 5.037014862469824e-06, "loss": 0.80459142, "num_input_tokens_seen": 411785616, "router_z_loss_mlp": 0.08770752, "routerloss_mlp": 0.0, "step": 4970, "time_per_iteration": 2.7282607555389404 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050652, "balance_loss_mlp": 1.0418489, "diversity_loss_mlp": 0.0, "epoch": 0.9563293574451712, "flos": 498201062400.0, "grad_norm": 0.06399713345893698, "language_loss": 0.80029887, "learning_rate": 4.993000940890391e-06, "loss": 0.81080544, "num_input_tokens_seen": 411854832, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4971, "time_per_iteration": 2.6104438304901123 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00588737, "balance_loss_mlp": 1.02730632, "diversity_loss_mlp": 0.13157621, "epoch": 0.9565217391304348, "flos": 1408875628032.0, "grad_norm": 0.0012650050689020306, "language_loss": 0.81773561, "learning_rate": 4.949179198071585e-06, "loss": 0.823623, "num_input_tokens_seen": 412081856, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0092962, "step": 4972, "time_per_iteration": 4.941720008850098 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044467, "balance_loss_mlp": 1.03565741, "diversity_loss_mlp": 0.0, "epoch": 0.9567141208156984, "flos": 503846853120.0, "grad_norm": 0.059256065258913096, "language_loss": 0.78335071, "learning_rate": 4.905549651026464e-06, "loss": 0.79379541, "num_input_tokens_seen": 412155600, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4973, "time_per_iteration": 2.788773775100708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049048, "balance_loss_mlp": 1.04036331, "diversity_loss_mlp": 0.0, "epoch": 0.9569065025009619, "flos": 433213526016.0, "grad_norm": 0.08268664024117288, "language_loss": 0.79965454, "learning_rate": 4.86211231669359e-06, "loss": 0.81014502, "num_input_tokens_seen": 412219584, "router_z_loss_mlp": 0.08685303, "routerloss_mlp": 0.0, "step": 4974, "time_per_iteration": 2.4901206493377686 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047752, "balance_loss_mlp": 1.03915691, "diversity_loss_mlp": 0.0, "epoch": 0.9570988841862255, "flos": 589959853056.0, "grad_norm": 0.0658884479140285, "language_loss": 0.78595436, "learning_rate": 4.818867211936806e-06, "loss": 0.7964319, "num_input_tokens_seen": 412295088, "router_z_loss_mlp": 0.08605957, "routerloss_mlp": 0.0, "step": 4975, "time_per_iteration": 4.219155550003052 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043882, "balance_loss_mlp": 1.03510857, "diversity_loss_mlp": 0.0, "epoch": 0.957291265871489, "flos": 767278448640.0, "grad_norm": 0.07813154083214305, "language_loss": 0.78541613, "learning_rate": 4.7758143535454045e-06, "loss": 0.79585493, "num_input_tokens_seen": 412376992, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 4976, "time_per_iteration": 2.9422388076782227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045842, "balance_loss_mlp": 1.03703845, "diversity_loss_mlp": 0.0, "epoch": 0.9574836475567526, "flos": 639104670720.0, "grad_norm": 0.07237747383924455, "language_loss": 0.84659564, "learning_rate": 4.732953758233849e-06, "loss": 0.85705405, "num_input_tokens_seen": 412450064, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 4977, "time_per_iteration": 2.826688528060913 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01004691, "balance_loss_mlp": 1.0002805, "diversity_loss_mlp": 0.0, "epoch": 0.9576760292420161, "flos": 1575939649536.0, "grad_norm": 0.006664188824760945, "language_loss": 0.78607261, "learning_rate": 4.690285442642272e-06, "loss": 0.79611945, "num_input_tokens_seen": 412676896, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 4978, "time_per_iteration": 4.937689781188965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078841, "balance_loss_mlp": 1.33186579, "diversity_loss_mlp": 0.22349364, "epoch": 0.9578684109272797, "flos": 496345439232.0, "grad_norm": 0.030270093123026424, "language_loss": 0.87261242, "learning_rate": 4.6478094233358695e-06, "loss": 0.8804965, "num_input_tokens_seen": 412746848, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01073015, "step": 4979, "time_per_iteration": 2.6448476314544678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00787724, "balance_loss_mlp": 1.330446, "diversity_loss_mlp": 0.2238563, "epoch": 0.9580607926125433, "flos": 429954531840.0, "grad_norm": 0.03851656500602482, "language_loss": 0.85486841, "learning_rate": 4.605525716805337e-06, "loss": 0.86274564, "num_input_tokens_seen": 412810144, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0105729, "step": 4980, "time_per_iteration": 2.513583183288574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048489, "balance_loss_mlp": 1.03938699, "diversity_loss_mlp": 0.0, "epoch": 0.9582531742978069, "flos": 1127262251520.0, "grad_norm": 0.0738676496011813, "language_loss": 0.80298102, "learning_rate": 4.563434339466599e-06, "loss": 0.81346583, "num_input_tokens_seen": 412904768, "router_z_loss_mlp": 0.09106445, "routerloss_mlp": 0.0, "step": 4981, "time_per_iteration": 3.532383441925049 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048146, "balance_loss_mlp": 1.03933644, "diversity_loss_mlp": 0.0, "epoch": 0.9584455559830705, "flos": 524458395648.0, "grad_norm": 0.05859325637714088, "language_loss": 0.79110616, "learning_rate": 4.521535307661085e-06, "loss": 0.80158764, "num_input_tokens_seen": 412974592, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4982, "time_per_iteration": 2.6554603576660156 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048551, "balance_loss_mlp": 1.03964579, "diversity_loss_mlp": 0.0, "epoch": 0.9586379376683339, "flos": 634187543040.0, "grad_norm": 0.05822993259734132, "language_loss": 0.81000149, "learning_rate": 4.479828637655392e-06, "loss": 0.82048702, "num_input_tokens_seen": 413052848, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4983, "time_per_iteration": 2.836662530899048 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045539, "balance_loss_mlp": 1.03656244, "diversity_loss_mlp": 0.0, "epoch": 0.9588303193535975, "flos": 416061038592.0, "grad_norm": 0.06921858371067632, "language_loss": 0.83688623, "learning_rate": 4.438314345641459e-06, "loss": 0.84734166, "num_input_tokens_seen": 413118000, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 4984, "time_per_iteration": 2.4890353679656982 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047663, "balance_loss_mlp": 1.03846598, "diversity_loss_mlp": 0.0, "epoch": 0.9590227010388611, "flos": 481683635712.0, "grad_norm": 0.0655069361339347, "language_loss": 0.78102469, "learning_rate": 4.3969924477365585e-06, "loss": 0.79150128, "num_input_tokens_seen": 413185616, "router_z_loss_mlp": 0.09204102, "routerloss_mlp": 0.0, "step": 4985, "time_per_iteration": 2.5810418128967285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046293, "balance_loss_mlp": 1.03757238, "diversity_loss_mlp": 0.0, "epoch": 0.9592150827241247, "flos": 684540440064.0, "grad_norm": 0.0696645623460603, "language_loss": 0.80404431, "learning_rate": 4.355862959983359e-06, "loss": 0.81450725, "num_input_tokens_seen": 413265616, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4986, "time_per_iteration": 3.0027694702148438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044808, "balance_loss_mlp": 1.03609419, "diversity_loss_mlp": 0.0, "epoch": 0.9594074644093882, "flos": 574490092032.0, "grad_norm": 0.06168953583598696, "language_loss": 0.70886958, "learning_rate": 4.314925898349642e-06, "loss": 0.71931762, "num_input_tokens_seen": 413341248, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 4987, "time_per_iteration": 2.7255663871765137 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046824, "balance_loss_mlp": 1.03819966, "diversity_loss_mlp": 0.0, "epoch": 0.9595998460946518, "flos": 546871233024.0, "grad_norm": 0.0653725751798929, "language_loss": 0.78369594, "learning_rate": 4.2741812787286395e-06, "loss": 0.79416412, "num_input_tokens_seen": 413416080, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 4988, "time_per_iteration": 2.7598073482513428 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01042023, "balance_loss_mlp": 1.03311229, "diversity_loss_mlp": 0.0, "epoch": 0.9597922277799154, "flos": 474043829760.0, "grad_norm": 0.07692135244194774, "language_loss": 0.78684759, "learning_rate": 4.233629116938809e-06, "loss": 0.79726779, "num_input_tokens_seen": 413482336, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 4989, "time_per_iteration": 2.5303213596343994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047607, "balance_loss_mlp": 1.03871953, "diversity_loss_mlp": 0.0, "epoch": 0.9599846094651789, "flos": 514691324928.0, "grad_norm": 0.08379738751426644, "language_loss": 0.85613489, "learning_rate": 4.193269428723889e-06, "loss": 0.866611, "num_input_tokens_seen": 413553248, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 4990, "time_per_iteration": 2.614570379257202 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046822, "balance_loss_mlp": 1.03815556, "diversity_loss_mlp": 0.0, "epoch": 0.9601769911504425, "flos": 594983066112.0, "grad_norm": 0.08435652614677631, "language_loss": 0.78316408, "learning_rate": 4.1531022297529035e-06, "loss": 0.79363227, "num_input_tokens_seen": 413625776, "router_z_loss_mlp": 0.08679199, "routerloss_mlp": 0.0, "step": 4991, "time_per_iteration": 2.748410224914551 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104591, "balance_loss_mlp": 1.03710628, "diversity_loss_mlp": 0.0, "epoch": 0.960369372835706, "flos": 493012293120.0, "grad_norm": 0.06666949415129908, "language_loss": 0.79405791, "learning_rate": 4.1131275356201536e-06, "loss": 0.80451697, "num_input_tokens_seen": 413693056, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 4992, "time_per_iteration": 2.6129846572875977 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049094, "balance_loss_mlp": 1.04027796, "diversity_loss_mlp": 0.0, "epoch": 0.9605617545209696, "flos": 579293420544.0, "grad_norm": 0.06505303405528073, "language_loss": 0.82855588, "learning_rate": 4.073345361845171e-06, "loss": 0.83904684, "num_input_tokens_seen": 413765616, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 4993, "time_per_iteration": 2.697122097015381 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048804, "balance_loss_mlp": 1.03996491, "diversity_loss_mlp": 0.0, "epoch": 0.9607541362062332, "flos": 927708857856.0, "grad_norm": 0.05557800406655289, "language_loss": 0.86002243, "learning_rate": 4.033755723872767e-06, "loss": 0.87051046, "num_input_tokens_seen": 413850976, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 4994, "time_per_iteration": 3.2234411239624023 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049247, "balance_loss_mlp": 1.04041374, "diversity_loss_mlp": 0.0, "epoch": 0.9609465178914968, "flos": 573121225728.0, "grad_norm": 0.05698113601966363, "language_loss": 0.75638676, "learning_rate": 3.994358637073036e-06, "loss": 0.7668792, "num_input_tokens_seen": 413931648, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 4995, "time_per_iteration": 2.811509847640991 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047757, "balance_loss_mlp": 1.03900671, "diversity_loss_mlp": 0.0, "epoch": 0.9611388995767602, "flos": 530850475008.0, "grad_norm": 0.06182635414067332, "language_loss": 0.85539091, "learning_rate": 3.955154116741244e-06, "loss": 0.86586845, "num_input_tokens_seen": 414003216, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 4996, "time_per_iteration": 2.6234097480773926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00776504, "balance_loss_mlp": 1.30815172, "diversity_loss_mlp": 0.22351003, "epoch": 0.9613312812620238, "flos": 646247808000.0, "grad_norm": 0.03585301103792293, "language_loss": 0.82592523, "learning_rate": 3.916142178097881e-06, "loss": 0.83369029, "num_input_tokens_seen": 414077072, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01067326, "step": 4997, "time_per_iteration": 2.7915287017822266 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0077909, "balance_loss_mlp": 1.31180668, "diversity_loss_mlp": 0.22519468, "epoch": 0.9615236629472874, "flos": 496152718848.0, "grad_norm": 0.032099715647482555, "language_loss": 0.77762806, "learning_rate": 3.877322836288888e-06, "loss": 0.78541887, "num_input_tokens_seen": 414157600, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0105895, "step": 4998, "time_per_iteration": 2.8831381797790527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045673, "balance_loss_mlp": 1.03671455, "diversity_loss_mlp": 0.0, "epoch": 0.961716044632551, "flos": 512974093824.0, "grad_norm": 0.0659062812504805, "language_loss": 0.75562751, "learning_rate": 3.838696106385153e-06, "loss": 0.76608419, "num_input_tokens_seen": 414224880, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 4999, "time_per_iteration": 2.5965874195098877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049929, "balance_loss_mlp": 1.0409348, "diversity_loss_mlp": 0.0, "epoch": 0.9619084263178146, "flos": 501084527616.0, "grad_norm": 0.06697543006955084, "language_loss": 0.80806673, "learning_rate": 3.800262003382904e-06, "loss": 0.81856602, "num_input_tokens_seen": 414291728, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 5000, "time_per_iteration": 2.5651276111602783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045606, "balance_loss_mlp": 1.0366478, "diversity_loss_mlp": 0.0, "epoch": 0.9621008080030781, "flos": 595635379200.0, "grad_norm": 0.0765647536824451, "language_loss": 0.75030309, "learning_rate": 3.7620205422035923e-06, "loss": 0.76075912, "num_input_tokens_seen": 414369568, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 5001, "time_per_iteration": 2.750175952911377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048226, "balance_loss_mlp": 1.03932738, "diversity_loss_mlp": 0.0, "epoch": 0.9622931896883417, "flos": 502250761728.0, "grad_norm": 0.07727900973651224, "language_loss": 0.81910348, "learning_rate": 3.723971737693899e-06, "loss": 0.82958579, "num_input_tokens_seen": 414441424, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 5002, "time_per_iteration": 2.665245294570923 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048413, "balance_loss_mlp": 1.03946078, "diversity_loss_mlp": 0.0, "epoch": 0.9624855713736052, "flos": 607287808512.0, "grad_norm": 0.0718035222006464, "language_loss": 0.80944788, "learning_rate": 3.6861156046256728e-06, "loss": 0.81993198, "num_input_tokens_seen": 414512960, "router_z_loss_mlp": 0.08959961, "routerloss_mlp": 0.0, "step": 5003, "time_per_iteration": 2.7820627689361572 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047459, "balance_loss_mlp": 1.03892946, "diversity_loss_mlp": 0.0, "epoch": 0.9626779530588688, "flos": 510715777536.0, "grad_norm": 0.09658490174394786, "language_loss": 0.85061997, "learning_rate": 3.648452157695936e-06, "loss": 0.86109459, "num_input_tokens_seen": 414577392, "router_z_loss_mlp": 0.08538818, "routerloss_mlp": 0.0, "step": 5004, "time_per_iteration": 2.5650572776794434 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051022, "balance_loss_mlp": 1.04228425, "diversity_loss_mlp": 0.0, "epoch": 0.9628703347441323, "flos": 627294025728.0, "grad_norm": 0.07079516660765435, "language_loss": 0.82573175, "learning_rate": 3.610981411526937e-06, "loss": 0.83624196, "num_input_tokens_seen": 414655152, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 5005, "time_per_iteration": 2.808318853378296 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01054525, "balance_loss_mlp": 1.04536355, "diversity_loss_mlp": 0.0, "epoch": 0.9630627164293959, "flos": 630758223360.0, "grad_norm": 0.06358415598016834, "language_loss": 0.77436566, "learning_rate": 3.573703380666149e-06, "loss": 0.78491098, "num_input_tokens_seen": 414730432, "router_z_loss_mlp": 0.09161377, "routerloss_mlp": 0.0, "step": 5006, "time_per_iteration": 2.7581474781036377 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046156, "balance_loss_mlp": 1.0372808, "diversity_loss_mlp": 0.0, "epoch": 0.9632550981146595, "flos": 570558961152.0, "grad_norm": 0.06259715736563402, "language_loss": 0.78214157, "learning_rate": 3.5366180795861622e-06, "loss": 0.79260308, "num_input_tokens_seen": 414810688, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 5007, "time_per_iteration": 2.8067400455474854 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047305, "balance_loss_mlp": 1.03849518, "diversity_loss_mlp": 0.0, "epoch": 0.9634474797999231, "flos": 466117327872.0, "grad_norm": 0.0652004870167461, "language_loss": 0.8097052, "learning_rate": 3.4997255226847937e-06, "loss": 0.82017827, "num_input_tokens_seen": 414880544, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 5008, "time_per_iteration": 2.6624722480773926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043775, "balance_loss_mlp": 1.03475678, "diversity_loss_mlp": 0.0, "epoch": 0.9636398614851867, "flos": 526600714752.0, "grad_norm": 0.07542594197578673, "language_loss": 0.85320652, "learning_rate": 3.463025724284974e-06, "loss": 0.8636443, "num_input_tokens_seen": 414949920, "router_z_loss_mlp": 0.09020996, "routerloss_mlp": 0.0, "step": 5009, "time_per_iteration": 2.649427890777588 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044726, "balance_loss_mlp": 1.03576136, "diversity_loss_mlp": 0.0, "epoch": 0.9638322431704501, "flos": 564831677952.0, "grad_norm": 0.06511821335900564, "language_loss": 0.75133872, "learning_rate": 3.4265186986348618e-06, "loss": 0.76178598, "num_input_tokens_seen": 415024288, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 5010, "time_per_iteration": 2.780074119567871 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046262, "balance_loss_mlp": 1.03736854, "diversity_loss_mlp": 0.0, "epoch": 0.9640246248557137, "flos": 477772328448.0, "grad_norm": 0.07329288404167861, "language_loss": 0.84246582, "learning_rate": 3.3902044599076754e-06, "loss": 0.8529284, "num_input_tokens_seen": 415092032, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 5011, "time_per_iteration": 2.651488780975342 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047189, "balance_loss_mlp": 1.03848636, "diversity_loss_mlp": 0.0, "epoch": 0.9642170065409773, "flos": 539318062080.0, "grad_norm": 0.06680869041289342, "language_loss": 0.88673419, "learning_rate": 3.354083022201859e-06, "loss": 0.89720607, "num_input_tokens_seen": 415158544, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 5012, "time_per_iteration": 2.6489691734313965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046396, "balance_loss_mlp": 1.03752685, "diversity_loss_mlp": 0.0, "epoch": 0.9644093882262409, "flos": 523754325504.0, "grad_norm": 0.06514803880345414, "language_loss": 0.83791411, "learning_rate": 3.3181543995410843e-06, "loss": 0.848378, "num_input_tokens_seen": 415225088, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 5013, "time_per_iteration": 2.57792067527771 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046587, "balance_loss_mlp": 1.03800964, "diversity_loss_mlp": 0.0, "epoch": 0.9646017699115044, "flos": 574290031104.0, "grad_norm": 0.06277044595718272, "language_loss": 0.78603232, "learning_rate": 3.2824186058740268e-06, "loss": 0.79649818, "num_input_tokens_seen": 415300224, "router_z_loss_mlp": 0.08587646, "routerloss_mlp": 0.0, "step": 5014, "time_per_iteration": 2.75705885887146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049, "balance_loss_mlp": 1.04026842, "diversity_loss_mlp": 0.0, "epoch": 0.964794151596768, "flos": 636799366656.0, "grad_norm": 0.10341285482454692, "language_loss": 0.84443051, "learning_rate": 3.246875655074588e-06, "loss": 0.85492051, "num_input_tokens_seen": 415368784, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 5015, "time_per_iteration": 2.7894856929779053 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104746, "balance_loss_mlp": 1.03886533, "diversity_loss_mlp": 0.0, "epoch": 0.9649865332820315, "flos": 617435550720.0, "grad_norm": 0.07303173278488923, "language_loss": 0.86459041, "learning_rate": 3.211525560941675e-06, "loss": 0.87506503, "num_input_tokens_seen": 415440752, "router_z_loss_mlp": 0.08605957, "routerloss_mlp": 0.0, "step": 5016, "time_per_iteration": 2.774505376815796 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045328, "balance_loss_mlp": 1.036268, "diversity_loss_mlp": 0.0, "epoch": 0.9651789149672951, "flos": 516183528960.0, "grad_norm": 0.06203977251445547, "language_loss": 0.81297398, "learning_rate": 3.1763683371994754e-06, "loss": 0.82342726, "num_input_tokens_seen": 415516128, "router_z_loss_mlp": 0.09063721, "routerloss_mlp": 0.0, "step": 5017, "time_per_iteration": 2.7457613945007324 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045833, "balance_loss_mlp": 1.0369395, "diversity_loss_mlp": 0.0, "epoch": 0.9653712966525587, "flos": 492940712448.0, "grad_norm": 0.07389028070446926, "language_loss": 0.80003834, "learning_rate": 3.1414039974972385e-06, "loss": 0.81049669, "num_input_tokens_seen": 415583744, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 5018, "time_per_iteration": 2.5559167861938477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047384, "balance_loss_mlp": 1.03835368, "diversity_loss_mlp": 0.0, "epoch": 0.9655636783378222, "flos": 536560505856.0, "grad_norm": 0.05876051048061586, "language_loss": 0.82367206, "learning_rate": 3.106632555409328e-06, "loss": 0.83414584, "num_input_tokens_seen": 415659856, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 5019, "time_per_iteration": 2.7295024394989014 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048475, "balance_loss_mlp": 1.03974926, "diversity_loss_mlp": 0.0, "epoch": 0.9657560600230858, "flos": 459023749632.0, "grad_norm": 0.0742577236438263, "language_loss": 0.82501537, "learning_rate": 3.072054024435167e-06, "loss": 0.83550012, "num_input_tokens_seen": 415731792, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 5020, "time_per_iteration": 2.732282876968384 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048667, "balance_loss_mlp": 1.0402087, "diversity_loss_mlp": 0.0, "epoch": 0.9659484417083494, "flos": 686178749952.0, "grad_norm": 0.07257927833574024, "language_loss": 0.83663607, "learning_rate": 3.0376684179994064e-06, "loss": 0.84712267, "num_input_tokens_seen": 415809536, "router_z_loss_mlp": 0.08465576, "routerloss_mlp": 0.0, "step": 5021, "time_per_iteration": 2.8645994663238525 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003265, "balance_loss_mlp": 0.99885476, "diversity_loss_mlp": 0.0, "epoch": 0.966140823393613, "flos": 1502292178944.0, "grad_norm": 0.004502170891661989, "language_loss": 0.80694246, "learning_rate": 3.0034757494516453e-06, "loss": 0.81697512, "num_input_tokens_seen": 416027600, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5022, "time_per_iteration": 4.703518390655518 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049998, "balance_loss_mlp": 1.04122436, "diversity_loss_mlp": 0.0, "epoch": 0.9663332050788765, "flos": 464899336704.0, "grad_norm": 0.08988904326994861, "language_loss": 0.81278229, "learning_rate": 2.9694760320667093e-06, "loss": 0.82328224, "num_input_tokens_seen": 416096128, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 5023, "time_per_iteration": 2.581846237182617 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010486, "balance_loss_mlp": 1.03996289, "diversity_loss_mlp": 0.0, "epoch": 0.96652558676414, "flos": 500834907648.0, "grad_norm": 0.07024301133900458, "language_loss": 0.85463035, "learning_rate": 2.9356692790444283e-06, "loss": 0.86511636, "num_input_tokens_seen": 416164256, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 5024, "time_per_iteration": 2.6678829193115234 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047106, "balance_loss_mlp": 1.03803396, "diversity_loss_mlp": 0.0, "epoch": 0.9667179684494036, "flos": 424839914496.0, "grad_norm": 0.0827162063613028, "language_loss": 0.82914466, "learning_rate": 2.9020555035097484e-06, "loss": 0.8396157, "num_input_tokens_seen": 416227296, "router_z_loss_mlp": 0.09075928, "routerloss_mlp": 0.0, "step": 5025, "time_per_iteration": 2.4615111351013184 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047117, "balance_loss_mlp": 1.03826559, "diversity_loss_mlp": 0.0, "epoch": 0.9669103501346672, "flos": 516996628992.0, "grad_norm": 0.061914921870518225, "language_loss": 0.85848838, "learning_rate": 2.8686347185127305e-06, "loss": 0.86895955, "num_input_tokens_seen": 416297184, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 5026, "time_per_iteration": 2.6631765365600586 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045557, "balance_loss_mlp": 1.03661585, "diversity_loss_mlp": 0.0, "epoch": 0.9671027318199308, "flos": 456241600512.0, "grad_norm": 0.10389844527854888, "language_loss": 0.75783134, "learning_rate": 2.8354069370284396e-06, "loss": 0.76828694, "num_input_tokens_seen": 416363056, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 5027, "time_per_iteration": 2.6192245483398438 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104253, "balance_loss_mlp": 1.03396487, "diversity_loss_mlp": 0.0, "epoch": 0.9672951135051943, "flos": 525058951680.0, "grad_norm": 0.06651584976337663, "language_loss": 0.80529153, "learning_rate": 2.802372171957057e-06, "loss": 0.8157168, "num_input_tokens_seen": 416430688, "router_z_loss_mlp": 0.08575439, "routerloss_mlp": 0.0, "step": 5028, "time_per_iteration": 2.6251182556152344 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047556, "balance_loss_mlp": 1.03856707, "diversity_loss_mlp": 0.0, "epoch": 0.9674874951904578, "flos": 573986082816.0, "grad_norm": 0.06722764033814799, "language_loss": 0.79839933, "learning_rate": 2.7695304361237682e-06, "loss": 0.80887485, "num_input_tokens_seen": 416505248, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 5029, "time_per_iteration": 2.7434749603271484 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104763, "balance_loss_mlp": 1.03893399, "diversity_loss_mlp": 0.0, "epoch": 0.9676798768757214, "flos": 629184153600.0, "grad_norm": 0.06316563947076154, "language_loss": 0.80004889, "learning_rate": 2.7368817422789848e-06, "loss": 0.81052518, "num_input_tokens_seen": 416592640, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 5030, "time_per_iteration": 2.9535553455352783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003251, "balance_loss_mlp": 0.99884009, "diversity_loss_mlp": 0.0, "epoch": 0.967872258560985, "flos": 1463880605184.0, "grad_norm": 0.004505813137803552, "language_loss": 0.75563359, "learning_rate": 2.7044261030979566e-06, "loss": 0.76566613, "num_input_tokens_seen": 416808560, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5031, "time_per_iteration": 4.665933609008789 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049308, "balance_loss_mlp": 1.04061723, "diversity_loss_mlp": 0.0, "epoch": 0.9680646402462486, "flos": 565503814656.0, "grad_norm": 0.07437893126618236, "language_loss": 0.79223692, "learning_rate": 2.672163531181049e-06, "loss": 0.80272996, "num_input_tokens_seen": 416878208, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 5032, "time_per_iteration": 2.6745200157165527 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003248, "balance_loss_mlp": 0.99883741, "diversity_loss_mlp": 0.0, "epoch": 0.9682570219315121, "flos": 1434463022592.0, "grad_norm": 0.004505868190554417, "language_loss": 0.78074801, "learning_rate": 2.6400940390537976e-06, "loss": 0.79078054, "num_input_tokens_seen": 417105968, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5033, "time_per_iteration": 4.830533027648926 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043814, "balance_loss_mlp": 1.03500438, "diversity_loss_mlp": 0.0, "epoch": 0.9684494036167757, "flos": 584610670080.0, "grad_norm": 0.07679444902591688, "language_loss": 0.81878042, "learning_rate": 2.608217639166688e-06, "loss": 0.82921857, "num_input_tokens_seen": 417175168, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 5034, "time_per_iteration": 2.7140636444091797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048554, "balance_loss_mlp": 1.03991711, "diversity_loss_mlp": 0.0, "epoch": 0.9686417853020393, "flos": 559064747520.0, "grad_norm": 0.06455129167487729, "language_loss": 0.84188414, "learning_rate": 2.5765343438950982e-06, "loss": 0.85236967, "num_input_tokens_seen": 417247760, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 5035, "time_per_iteration": 2.7100539207458496 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048563, "balance_loss_mlp": 1.03969407, "diversity_loss_mlp": 0.0, "epoch": 0.9688341669873028, "flos": 784927604736.0, "grad_norm": 0.07457469088112735, "language_loss": 0.8308925, "learning_rate": 2.545044165539745e-06, "loss": 0.84137809, "num_input_tokens_seen": 417324080, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 5036, "time_per_iteration": 3.0273303985595703 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00775046, "balance_loss_mlp": 1.30442953, "diversity_loss_mlp": 0.22392677, "epoch": 0.9690265486725663, "flos": 395899176960.0, "grad_norm": 0.03634578837356394, "language_loss": 0.79774749, "learning_rate": 2.513747116326126e-06, "loss": 0.805498, "num_input_tokens_seen": 417386416, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01086747, "step": 5037, "time_per_iteration": 2.496250629425049 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046622, "balance_loss_mlp": 1.03794384, "diversity_loss_mlp": 0.0, "epoch": 0.9692189303578299, "flos": 476373726720.0, "grad_norm": 0.07461894486851982, "language_loss": 0.77795297, "learning_rate": 2.4826432084048002e-06, "loss": 0.78841919, "num_input_tokens_seen": 417459648, "router_z_loss_mlp": 0.08685303, "routerloss_mlp": 0.0, "step": 5038, "time_per_iteration": 2.735316753387451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046103, "balance_loss_mlp": 1.03756189, "diversity_loss_mlp": 0.0, "epoch": 0.9694113120430935, "flos": 597575066112.0, "grad_norm": 0.07661744515255002, "language_loss": 0.79197067, "learning_rate": 2.451732453851385e-06, "loss": 0.8024317, "num_input_tokens_seen": 417530512, "router_z_loss_mlp": 0.08551025, "routerloss_mlp": 0.0, "step": 5039, "time_per_iteration": 2.7147159576416016 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043927, "balance_loss_mlp": 1.03520727, "diversity_loss_mlp": 0.0, "epoch": 0.9696036937283571, "flos": 500881895424.0, "grad_norm": 0.06459150402718168, "language_loss": 0.82762325, "learning_rate": 2.4210148646665598e-06, "loss": 0.83806252, "num_input_tokens_seen": 417597600, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 5040, "time_per_iteration": 2.5953493118286133 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043708, "balance_loss_mlp": 1.03482664, "diversity_loss_mlp": 0.0, "epoch": 0.9697960754136207, "flos": 432277088256.0, "grad_norm": 0.08520160899358113, "language_loss": 0.87077874, "learning_rate": 2.3904904527758952e-06, "loss": 0.88121581, "num_input_tokens_seen": 417659616, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 5041, "time_per_iteration": 2.470695972442627 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047399, "balance_loss_mlp": 1.03847671, "diversity_loss_mlp": 0.0, "epoch": 0.9699884570988841, "flos": 568540353024.0, "grad_norm": 0.0661289335538221, "language_loss": 0.85483861, "learning_rate": 2.3601592300300235e-06, "loss": 0.86531258, "num_input_tokens_seen": 417730896, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 5042, "time_per_iteration": 2.7053682804107666 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104593, "balance_loss_mlp": 1.03708434, "diversity_loss_mlp": 0.0, "epoch": 0.9701808387841477, "flos": 516215835648.0, "grad_norm": 0.06476327659734085, "language_loss": 0.81779778, "learning_rate": 2.33002120820458e-06, "loss": 0.82825708, "num_input_tokens_seen": 417803296, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 5043, "time_per_iteration": 2.6728196144104004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046757, "balance_loss_mlp": 1.03794777, "diversity_loss_mlp": 0.0, "epoch": 0.9703732204694113, "flos": 491517517824.0, "grad_norm": 0.08267177511200062, "language_loss": 0.76453322, "learning_rate": 2.300076399000206e-06, "loss": 0.77500081, "num_input_tokens_seen": 417870208, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 5044, "time_per_iteration": 2.5768589973449707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047615, "balance_loss_mlp": 1.03866804, "diversity_loss_mlp": 0.0, "epoch": 0.9705656021546749, "flos": 626120451072.0, "grad_norm": 0.06897516762466789, "language_loss": 0.80167985, "learning_rate": 2.2703248140424348e-06, "loss": 0.81215596, "num_input_tokens_seen": 417944464, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 5045, "time_per_iteration": 2.795342206954956 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045575, "balance_loss_mlp": 1.03677726, "diversity_loss_mlp": 0.0, "epoch": 0.9707579838399384, "flos": 471437148672.0, "grad_norm": 0.0755169004935037, "language_loss": 0.83042562, "learning_rate": 2.2407664648819715e-06, "loss": 0.84088135, "num_input_tokens_seen": 418010480, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 5046, "time_per_iteration": 2.5994091033935547 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046592, "balance_loss_mlp": 1.03778839, "diversity_loss_mlp": 0.0, "epoch": 0.970950365525202, "flos": 492103019520.0, "grad_norm": 0.07013648257820884, "language_loss": 0.80700469, "learning_rate": 2.2114013629942475e-06, "loss": 0.81747067, "num_input_tokens_seen": 418083952, "router_z_loss_mlp": 0.08807373, "routerloss_mlp": 0.0, "step": 5047, "time_per_iteration": 2.695164680480957 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044022, "balance_loss_mlp": 1.03531933, "diversity_loss_mlp": 0.0, "epoch": 0.9711427472104656, "flos": 557322923520.0, "grad_norm": 0.06514840583334829, "language_loss": 0.80631614, "learning_rate": 2.1822295197799213e-06, "loss": 0.81675637, "num_input_tokens_seen": 418156672, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 5048, "time_per_iteration": 2.692713975906372 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049343, "balance_loss_mlp": 1.04089093, "diversity_loss_mlp": 0.0, "epoch": 0.9713351288957291, "flos": 625841095680.0, "grad_norm": 0.06192564808689567, "language_loss": 0.83786458, "learning_rate": 2.153250946564489e-06, "loss": 0.84835804, "num_input_tokens_seen": 418242160, "router_z_loss_mlp": 0.08459473, "routerloss_mlp": 0.0, "step": 5049, "time_per_iteration": 2.934725761413574 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049582, "balance_loss_mlp": 1.04098153, "diversity_loss_mlp": 0.0, "epoch": 0.9715275105809927, "flos": 499073260032.0, "grad_norm": 0.0692175783084948, "language_loss": 0.81435341, "learning_rate": 2.1244656545983397e-06, "loss": 0.82484925, "num_input_tokens_seen": 418316960, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 5050, "time_per_iteration": 2.732560873031616 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047834, "balance_loss_mlp": 1.03919172, "diversity_loss_mlp": 0.0, "epoch": 0.9717198922662562, "flos": 477515367936.0, "grad_norm": 0.07244382675246107, "language_loss": 0.77611834, "learning_rate": 2.0958736550570345e-06, "loss": 0.78659672, "num_input_tokens_seen": 418383888, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 5051, "time_per_iteration": 2.553946018218994 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048661, "balance_loss_mlp": 1.03976798, "diversity_loss_mlp": 0.0, "epoch": 0.9719122739515198, "flos": 553446120960.0, "grad_norm": 0.058871704281843434, "language_loss": 0.78665662, "learning_rate": 2.067474959040916e-06, "loss": 0.79714322, "num_input_tokens_seen": 418453776, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 5052, "time_per_iteration": 2.700554847717285 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104788, "balance_loss_mlp": 1.03924966, "diversity_loss_mlp": 0.0, "epoch": 0.9721046556367834, "flos": 565852179456.0, "grad_norm": 0.06621518812082018, "language_loss": 0.79820377, "learning_rate": 2.0392695775753312e-06, "loss": 0.80868256, "num_input_tokens_seen": 418521984, "router_z_loss_mlp": 0.08630371, "routerloss_mlp": 0.0, "step": 5053, "time_per_iteration": 2.6846559047698975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048468, "balance_loss_mlp": 1.03977799, "diversity_loss_mlp": 0.0, "epoch": 0.972297037322047, "flos": 560315045376.0, "grad_norm": 0.07341823776686772, "language_loss": 0.78280944, "learning_rate": 2.0112575216105766e-06, "loss": 0.79329413, "num_input_tokens_seen": 418598768, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 5054, "time_per_iteration": 2.773064136505127 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047195, "balance_loss_mlp": 1.03857064, "diversity_loss_mlp": 0.0, "epoch": 0.9724894190073105, "flos": 512440349184.0, "grad_norm": 0.07604483960314544, "language_loss": 0.79561597, "learning_rate": 1.9834388020218974e-06, "loss": 0.80608791, "num_input_tokens_seen": 418670064, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 5055, "time_per_iteration": 2.6578407287597656 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046653, "balance_loss_mlp": 1.03799832, "diversity_loss_mlp": 0.0, "epoch": 0.972681800692574, "flos": 613832961024.0, "grad_norm": 0.0731380618710485, "language_loss": 0.80641949, "learning_rate": 1.9558134296094875e-06, "loss": 0.81688601, "num_input_tokens_seen": 418745216, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 5056, "time_per_iteration": 2.778132438659668 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049506, "balance_loss_mlp": 1.04049361, "diversity_loss_mlp": 0.0, "epoch": 0.9728741823778376, "flos": 833911635456.0, "grad_norm": 0.06341434190577709, "language_loss": 0.83532751, "learning_rate": 1.92838141509849e-06, "loss": 0.84582257, "num_input_tokens_seen": 418824224, "router_z_loss_mlp": 0.09008789, "routerloss_mlp": 0.0, "step": 5057, "time_per_iteration": 3.070535898208618 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104959, "balance_loss_mlp": 1.04053009, "diversity_loss_mlp": 0.0, "epoch": 0.9730665640631012, "flos": 571450982400.0, "grad_norm": 0.06728126412432961, "language_loss": 0.84373492, "learning_rate": 1.9011427691389415e-06, "loss": 0.85423088, "num_input_tokens_seen": 418899712, "router_z_loss_mlp": 0.09057617, "routerloss_mlp": 0.0, "step": 5058, "time_per_iteration": 2.7407948970794678 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01041956, "balance_loss_mlp": 1.03302085, "diversity_loss_mlp": 0.0, "epoch": 0.9732589457483648, "flos": 506520345600.0, "grad_norm": 0.06896959434834592, "language_loss": 0.77172613, "learning_rate": 1.8740975023057715e-06, "loss": 0.78214562, "num_input_tokens_seen": 418964912, "router_z_loss_mlp": 0.0894165, "routerloss_mlp": 0.0, "step": 5059, "time_per_iteration": 2.593101978302002 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045555, "balance_loss_mlp": 1.03695965, "diversity_loss_mlp": 0.0, "epoch": 0.9734513274336283, "flos": 926977623552.0, "grad_norm": 0.06467450172514855, "language_loss": 0.80509335, "learning_rate": 1.84724562509897e-06, "loss": 0.8155489, "num_input_tokens_seen": 419040032, "router_z_loss_mlp": 0.08599854, "routerloss_mlp": 0.0, "step": 5060, "time_per_iteration": 3.130805015563965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048144, "balance_loss_mlp": 1.03940582, "diversity_loss_mlp": 0.0, "epoch": 0.9736437091188919, "flos": 491930122752.0, "grad_norm": 0.07143647662877724, "language_loss": 0.7819376, "learning_rate": 1.8205871479433089e-06, "loss": 0.79241908, "num_input_tokens_seen": 419112672, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 5061, "time_per_iteration": 2.7030551433563232 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105027, "balance_loss_mlp": 1.04135358, "diversity_loss_mlp": 0.0, "epoch": 0.9738360908041555, "flos": 613321611264.0, "grad_norm": 0.07722158587827427, "language_loss": 0.8399719, "learning_rate": 1.7941220811885096e-06, "loss": 0.8504746, "num_input_tokens_seen": 419183408, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 5062, "time_per_iteration": 2.7250983715057373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00588666, "balance_loss_mlp": 1.0272553, "diversity_loss_mlp": 0.13149816, "epoch": 0.974028472489419, "flos": 1549561549824.0, "grad_norm": 0.001262541739400147, "language_loss": 0.75992095, "learning_rate": 1.7678504351092972e-06, "loss": 0.76580763, "num_input_tokens_seen": 419415472, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00928975, "step": 5063, "time_per_iteration": 4.984234094619751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0058866, "balance_loss_mlp": 1.02724576, "diversity_loss_mlp": 0.13149402, "epoch": 0.9742208541746825, "flos": 1411155965952.0, "grad_norm": 0.0012626586872862898, "language_loss": 0.79677713, "learning_rate": 1.7417722199051245e-06, "loss": 0.8026638, "num_input_tokens_seen": 419651840, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00929021, "step": 5064, "time_per_iteration": 4.959820032119751 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043111, "balance_loss_mlp": 1.03426552, "diversity_loss_mlp": 0.0, "epoch": 0.9744132358599461, "flos": 674884597248.0, "grad_norm": 0.061567595116442546, "language_loss": 0.76945543, "learning_rate": 1.7158874457005592e-06, "loss": 0.77988654, "num_input_tokens_seen": 419729424, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 5065, "time_per_iteration": 2.8605847358703613 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046536, "balance_loss_mlp": 1.03767872, "diversity_loss_mlp": 0.0, "epoch": 0.9746056175452097, "flos": 598407616512.0, "grad_norm": 0.06408228412896971, "language_loss": 0.77837121, "learning_rate": 1.690196122544896e-06, "loss": 0.78883654, "num_input_tokens_seen": 419803616, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 5066, "time_per_iteration": 2.8428735733032227 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051292, "balance_loss_mlp": 1.04271507, "diversity_loss_mlp": 0.0, "epoch": 0.9747979992304733, "flos": 732175428096.0, "grad_norm": 0.06431524577835049, "language_loss": 0.82438833, "learning_rate": 1.6646982604123784e-06, "loss": 0.83490127, "num_input_tokens_seen": 419883536, "router_z_loss_mlp": 0.08581543, "routerloss_mlp": 0.0, "step": 5067, "time_per_iteration": 2.9748458862304688 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046087, "balance_loss_mlp": 1.03706264, "diversity_loss_mlp": 0.0, "epoch": 0.9749903809157369, "flos": 616499112960.0, "grad_norm": 0.07892101071391965, "language_loss": 0.76234651, "learning_rate": 1.6393938692022548e-06, "loss": 0.7728073, "num_input_tokens_seen": 419956816, "router_z_loss_mlp": 0.090271, "routerloss_mlp": 0.0, "step": 5068, "time_per_iteration": 2.720424175262451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049533, "balance_loss_mlp": 1.04073572, "diversity_loss_mlp": 0.0, "epoch": 0.9751827626010003, "flos": 468398039040.0, "grad_norm": 0.06592156995071553, "language_loss": 0.84109974, "learning_rate": 1.6142829587384443e-06, "loss": 0.85159504, "num_input_tokens_seen": 420022096, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 5069, "time_per_iteration": 2.5736031532287598 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048842, "balance_loss_mlp": 1.03985965, "diversity_loss_mlp": 0.0, "epoch": 0.9753751442862639, "flos": 599215574016.0, "grad_norm": 0.08190997494854581, "language_loss": 0.85377657, "learning_rate": 1.5893655387698713e-06, "loss": 0.86426497, "num_input_tokens_seen": 420097008, "router_z_loss_mlp": 0.08990479, "routerloss_mlp": 0.0, "step": 5070, "time_per_iteration": 2.8101613521575928 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049289, "balance_loss_mlp": 1.04077792, "diversity_loss_mlp": 0.0, "epoch": 0.9755675259715275, "flos": 650806285824.0, "grad_norm": 0.0795575480548678, "language_loss": 0.82202387, "learning_rate": 1.5646416189704637e-06, "loss": 0.83251673, "num_input_tokens_seen": 420174960, "router_z_loss_mlp": 0.08514404, "routerloss_mlp": 0.0, "step": 5071, "time_per_iteration": 2.890133857727051 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047822, "balance_loss_mlp": 1.03918517, "diversity_loss_mlp": 0.0, "epoch": 0.9757599076567911, "flos": 563658103296.0, "grad_norm": 0.08438970561016089, "language_loss": 0.79632509, "learning_rate": 1.5401112089387659e-06, "loss": 0.80680329, "num_input_tokens_seen": 420245248, "router_z_loss_mlp": 0.08642578, "routerloss_mlp": 0.0, "step": 5072, "time_per_iteration": 2.678088426589966 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044524, "balance_loss_mlp": 1.03586388, "diversity_loss_mlp": 0.0, "epoch": 0.9759522893420547, "flos": 504637558272.0, "grad_norm": 0.07402137285679701, "language_loss": 0.80289578, "learning_rate": 1.5157743181983819e-06, "loss": 0.81334102, "num_input_tokens_seen": 420310688, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 5073, "time_per_iteration": 2.5970799922943115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048341, "balance_loss_mlp": 1.03980589, "diversity_loss_mlp": 0.0, "epoch": 0.9761446710273182, "flos": 583728560640.0, "grad_norm": 0.07471313714776352, "language_loss": 0.82160485, "learning_rate": 1.4916309561976982e-06, "loss": 0.83208829, "num_input_tokens_seen": 420379008, "router_z_loss_mlp": 0.08544922, "routerloss_mlp": 0.0, "step": 5074, "time_per_iteration": 2.724550724029541 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047011, "balance_loss_mlp": 1.03809404, "diversity_loss_mlp": 0.0, "epoch": 0.9763370527125818, "flos": 482207468544.0, "grad_norm": 0.07314052432610715, "language_loss": 0.81910318, "learning_rate": 1.4676811323099947e-06, "loss": 0.82957333, "num_input_tokens_seen": 420445504, "router_z_loss_mlp": 0.08917236, "routerloss_mlp": 0.0, "step": 5075, "time_per_iteration": 2.6065866947174072 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047284, "balance_loss_mlp": 1.03871298, "diversity_loss_mlp": 0.0, "epoch": 0.9765294343978453, "flos": 618987225600.0, "grad_norm": 0.06220869349054033, "language_loss": 0.78624564, "learning_rate": 1.4439248558335561e-06, "loss": 0.79671854, "num_input_tokens_seen": 420520528, "router_z_loss_mlp": 0.08575439, "routerloss_mlp": 0.0, "step": 5076, "time_per_iteration": 2.7079405784606934 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00791822, "balance_loss_mlp": 1.33702493, "diversity_loss_mlp": 0.22525913, "epoch": 0.9767218160831089, "flos": 526573550592.0, "grad_norm": 0.034551396825965836, "language_loss": 0.85462183, "learning_rate": 1.4203621359911712e-06, "loss": 0.86254001, "num_input_tokens_seen": 420586224, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01067994, "step": 5077, "time_per_iteration": 2.641120195388794 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044614, "balance_loss_mlp": 1.03591776, "diversity_loss_mlp": 0.0, "epoch": 0.9769141977683724, "flos": 525194772480.0, "grad_norm": 0.06272749449600955, "language_loss": 0.84269607, "learning_rate": 1.3969929819308557e-06, "loss": 0.85314226, "num_input_tokens_seen": 420655456, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 5078, "time_per_iteration": 2.6361942291259766 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049536, "balance_loss_mlp": 1.04075623, "diversity_loss_mlp": 0.0, "epoch": 0.977106579453636, "flos": 457615236096.0, "grad_norm": 0.06781093629055318, "language_loss": 0.80375177, "learning_rate": 1.3738174027252416e-06, "loss": 0.81424713, "num_input_tokens_seen": 420733216, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 5079, "time_per_iteration": 2.799654245376587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047155, "balance_loss_mlp": 1.03826737, "diversity_loss_mlp": 0.0, "epoch": 0.9772989611388996, "flos": 532090861056.0, "grad_norm": 0.07054076117423486, "language_loss": 0.8182112, "learning_rate": 1.3508354073719642e-06, "loss": 0.82868278, "num_input_tokens_seen": 420803376, "router_z_loss_mlp": 0.08892822, "routerloss_mlp": 0.0, "step": 5080, "time_per_iteration": 2.6261301040649414 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046783, "balance_loss_mlp": 1.03806853, "diversity_loss_mlp": 0.0, "epoch": 0.9774913428241632, "flos": 755349235200.0, "grad_norm": 0.08607720599847436, "language_loss": 0.85967886, "learning_rate": 1.3280470047933313e-06, "loss": 0.87014663, "num_input_tokens_seen": 420886256, "router_z_loss_mlp": 0.0871582, "routerloss_mlp": 0.0, "step": 5081, "time_per_iteration": 3.0126025676727295 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0100325, "balance_loss_mlp": 0.99883974, "diversity_loss_mlp": 0.0, "epoch": 0.9776837245094268, "flos": 1554320088576.0, "grad_norm": 0.004504556807133143, "language_loss": 0.78895497, "learning_rate": 1.3054522038366544e-06, "loss": 0.79898739, "num_input_tokens_seen": 421123728, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5082, "time_per_iteration": 4.989394903182983 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048062, "balance_loss_mlp": 1.03916299, "diversity_loss_mlp": 0.0, "epoch": 0.9778761061946902, "flos": 592534600704.0, "grad_norm": 0.08681180158775233, "language_loss": 0.84184444, "learning_rate": 1.2830510132739725e-06, "loss": 0.85232502, "num_input_tokens_seen": 421192576, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 5083, "time_per_iteration": 2.694652557373047 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049697, "balance_loss_mlp": 1.04097056, "diversity_loss_mlp": 0.0, "epoch": 0.9780684878799538, "flos": 414951704064.0, "grad_norm": 0.06774609280174271, "language_loss": 0.81603408, "learning_rate": 1.2608434418022175e-06, "loss": 0.82653111, "num_input_tokens_seen": 421256272, "router_z_loss_mlp": 0.08734131, "routerloss_mlp": 0.0, "step": 5084, "time_per_iteration": 2.469529151916504 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049134, "balance_loss_mlp": 1.04026437, "diversity_loss_mlp": 0.0, "epoch": 0.9782608695652174, "flos": 568411872768.0, "grad_norm": 0.06648884426689973, "language_loss": 0.84724671, "learning_rate": 1.2388294980431036e-06, "loss": 0.85773802, "num_input_tokens_seen": 421332880, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 5085, "time_per_iteration": 2.7240426540374756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046379, "balance_loss_mlp": 1.03747988, "diversity_loss_mlp": 0.0, "epoch": 0.978453251250481, "flos": 690472926720.0, "grad_norm": 0.07204518126062733, "language_loss": 0.82956612, "learning_rate": 1.217009190543239e-06, "loss": 0.84002984, "num_input_tokens_seen": 421406160, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 5086, "time_per_iteration": 2.872143507003784 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046648, "balance_loss_mlp": 1.03782678, "diversity_loss_mlp": 0.0, "epoch": 0.9786456329357445, "flos": 502505150976.0, "grad_norm": 0.06206480321158436, "language_loss": 0.77373308, "learning_rate": 1.1953825277740694e-06, "loss": 0.7841996, "num_input_tokens_seen": 421476208, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 5087, "time_per_iteration": 2.6224989891052246 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046355, "balance_loss_mlp": 1.0377537, "diversity_loss_mlp": 0.0, "epoch": 0.9788380146210081, "flos": 863183485440.0, "grad_norm": 0.07890344203678189, "language_loss": 0.80865037, "learning_rate": 1.1739495181317117e-06, "loss": 0.81911391, "num_input_tokens_seen": 421549232, "router_z_loss_mlp": 0.08605957, "routerloss_mlp": 0.0, "step": 5088, "time_per_iteration": 3.021143913269043 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046697, "balance_loss_mlp": 1.03788161, "diversity_loss_mlp": 0.0, "epoch": 0.9790303963062716, "flos": 512717133312.0, "grad_norm": 0.0707462132351249, "language_loss": 0.83914399, "learning_rate": 1.1527101699371767e-06, "loss": 0.84961092, "num_input_tokens_seen": 421617056, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 5089, "time_per_iteration": 2.6006627082824707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048629, "balance_loss_mlp": 1.03978372, "diversity_loss_mlp": 0.0, "epoch": 0.9792227779915352, "flos": 494428147200.0, "grad_norm": 0.07258951215182398, "language_loss": 0.86249393, "learning_rate": 1.1316644914364237e-06, "loss": 0.87298024, "num_input_tokens_seen": 421683424, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 5090, "time_per_iteration": 2.57961106300354 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104929, "balance_loss_mlp": 1.04054606, "diversity_loss_mlp": 0.0, "epoch": 0.9794151596767988, "flos": 608325562368.0, "grad_norm": 0.0687229233050849, "language_loss": 0.81661642, "learning_rate": 1.1108124908000838e-06, "loss": 0.82710934, "num_input_tokens_seen": 421761200, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 5091, "time_per_iteration": 2.774179458618164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043168, "balance_loss_mlp": 1.03394735, "diversity_loss_mlp": 0.0, "epoch": 0.9796075413620623, "flos": 478222009344.0, "grad_norm": 0.06774886047931106, "language_loss": 0.86759937, "learning_rate": 1.09015417612357e-06, "loss": 0.87803102, "num_input_tokens_seen": 421829600, "router_z_loss_mlp": 0.09222412, "routerloss_mlp": 0.0, "step": 5092, "time_per_iteration": 2.5726425647735596 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044727, "balance_loss_mlp": 1.03592968, "diversity_loss_mlp": 0.0, "epoch": 0.9797999230473259, "flos": 592220740608.0, "grad_norm": 0.06986809662631227, "language_loss": 0.84486377, "learning_rate": 1.0696895554271335e-06, "loss": 0.85531104, "num_input_tokens_seen": 421904928, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 5093, "time_per_iteration": 2.734572649002075 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044433, "balance_loss_mlp": 1.03564167, "diversity_loss_mlp": 0.0, "epoch": 0.9799923047325895, "flos": 556381343232.0, "grad_norm": 0.06627525100654652, "language_loss": 0.8142283, "learning_rate": 1.049418636655919e-06, "loss": 0.82467258, "num_input_tokens_seen": 421989616, "router_z_loss_mlp": 0.08795166, "routerloss_mlp": 0.0, "step": 5094, "time_per_iteration": 2.901499032974243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104922, "balance_loss_mlp": 1.04027307, "diversity_loss_mlp": 0.0, "epoch": 0.9801846864178531, "flos": 579456405504.0, "grad_norm": 0.05858269256579561, "language_loss": 0.84523547, "learning_rate": 1.0293414276797974e-06, "loss": 0.85572767, "num_input_tokens_seen": 422067088, "router_z_loss_mlp": 0.08953857, "routerloss_mlp": 0.0, "step": 5095, "time_per_iteration": 2.749011754989624 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048661, "balance_loss_mlp": 1.03991067, "diversity_loss_mlp": 0.0, "epoch": 0.9803770681031165, "flos": 515101358592.0, "grad_norm": 0.08054047976821545, "language_loss": 0.8013413, "learning_rate": 1.0094579362933677e-06, "loss": 0.81182784, "num_input_tokens_seen": 422141136, "router_z_loss_mlp": 0.08752441, "routerloss_mlp": 0.0, "step": 5096, "time_per_iteration": 2.6734437942504883 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048068, "balance_loss_mlp": 1.03946686, "diversity_loss_mlp": 0.0, "epoch": 0.9805694497883801, "flos": 566988678144.0, "grad_norm": 0.06350240490258963, "language_loss": 0.7813378, "learning_rate": 9.897681702160654e-07, "loss": 0.79181844, "num_input_tokens_seen": 422216400, "router_z_loss_mlp": 0.08605957, "routerloss_mlp": 0.0, "step": 5097, "time_per_iteration": 2.726039409637451 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047979, "balance_loss_mlp": 1.03928292, "diversity_loss_mlp": 0.0, "epoch": 0.9807618314736437, "flos": 479351167488.0, "grad_norm": 0.06123275422091068, "language_loss": 0.73776084, "learning_rate": 9.702721370922208e-07, "loss": 0.74824059, "num_input_tokens_seen": 422287664, "router_z_loss_mlp": 0.08709717, "routerloss_mlp": 0.0, "step": 5098, "time_per_iteration": 2.6765458583831787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00779933, "balance_loss_mlp": 1.31478071, "diversity_loss_mlp": 0.22396225, "epoch": 0.9809542131589073, "flos": 545285053440.0, "grad_norm": 0.03778989641153832, "language_loss": 0.80182397, "learning_rate": 9.509698444908344e-07, "loss": 0.8096233, "num_input_tokens_seen": 422357552, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.0105617, "step": 5099, "time_per_iteration": 2.6399407386779785 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047076, "balance_loss_mlp": 1.03835607, "diversity_loss_mlp": 0.0, "epoch": 0.9811465948441709, "flos": 520843696128.0, "grad_norm": 0.0712325944726878, "language_loss": 0.79504228, "learning_rate": 9.318612999057452e-07, "loss": 0.80551302, "num_input_tokens_seen": 422425872, "router_z_loss_mlp": 0.08728027, "routerloss_mlp": 0.0, "step": 5100, "time_per_iteration": 2.605034351348877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047574, "balance_loss_mlp": 1.03872824, "diversity_loss_mlp": 0.0, "epoch": 0.9813389765294344, "flos": 541282341888.0, "grad_norm": 0.07915756516451043, "language_loss": 0.80425239, "learning_rate": 9.129465107554635e-07, "loss": 0.81472808, "num_input_tokens_seen": 422495760, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 5101, "time_per_iteration": 2.653615713119507 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045606, "balance_loss_mlp": 1.03676605, "diversity_loss_mlp": 0.0, "epoch": 0.981531358214698, "flos": 567356866560.0, "grad_norm": 0.07121268040890673, "language_loss": 0.84309268, "learning_rate": 8.942254843834485e-07, "loss": 0.85354877, "num_input_tokens_seen": 422568112, "router_z_loss_mlp": 0.08837891, "routerloss_mlp": 0.0, "step": 5102, "time_per_iteration": 2.7331223487854004 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048136, "balance_loss_mlp": 1.03933203, "diversity_loss_mlp": 0.0, "epoch": 0.9817237398999615, "flos": 577272241152.0, "grad_norm": 0.06082212845964829, "language_loss": 0.80932826, "learning_rate": 8.756982280578307e-07, "loss": 0.81980968, "num_input_tokens_seen": 422641280, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 5103, "time_per_iteration": 2.731088876724243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047414, "balance_loss_mlp": 1.03868246, "diversity_loss_mlp": 0.0, "epoch": 0.9819161215852251, "flos": 701507547648.0, "grad_norm": 0.06577153639103081, "language_loss": 0.82189977, "learning_rate": 8.573647489714676e-07, "loss": 0.83237398, "num_input_tokens_seen": 422720416, "router_z_loss_mlp": 0.08740234, "routerloss_mlp": 0.0, "step": 5104, "time_per_iteration": 2.952533721923828 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047188, "balance_loss_mlp": 1.03831923, "diversity_loss_mlp": 0.0, "epoch": 0.9821085032704886, "flos": 624188104704.0, "grad_norm": 0.06798431241240387, "language_loss": 0.84167528, "learning_rate": 8.392250542421653e-07, "loss": 0.85214722, "num_input_tokens_seen": 422800384, "router_z_loss_mlp": 0.08880615, "routerloss_mlp": 0.0, "step": 5105, "time_per_iteration": 2.86313533782959 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044021, "balance_loss_mlp": 1.03541374, "diversity_loss_mlp": 0.0, "epoch": 0.9823008849557522, "flos": 499505688576.0, "grad_norm": 0.06686184516115971, "language_loss": 0.81452221, "learning_rate": 8.212791509122353e-07, "loss": 0.82496238, "num_input_tokens_seen": 422870768, "router_z_loss_mlp": 0.08612061, "routerloss_mlp": 0.0, "step": 5106, "time_per_iteration": 2.708230495452881 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045424, "balance_loss_mlp": 1.03651953, "diversity_loss_mlp": 0.0, "epoch": 0.9824932666410158, "flos": 523815994368.0, "grad_norm": 0.07713140113072105, "language_loss": 0.72798324, "learning_rate": 8.035270459489929e-07, "loss": 0.73843747, "num_input_tokens_seen": 422942864, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 5107, "time_per_iteration": 2.6602892875671387 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047946, "balance_loss_mlp": 1.0389818, "diversity_loss_mlp": 0.0, "epoch": 0.9826856483262794, "flos": 502663366656.0, "grad_norm": 0.06073968757615098, "language_loss": 0.82624412, "learning_rate": 7.859687462443698e-07, "loss": 0.83672357, "num_input_tokens_seen": 423013600, "router_z_loss_mlp": 0.08966064, "routerloss_mlp": 0.0, "step": 5108, "time_per_iteration": 2.637178421020508 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046295, "balance_loss_mlp": 1.03743768, "diversity_loss_mlp": 0.0, "epoch": 0.982878030011543, "flos": 562056869376.0, "grad_norm": 0.05986915063822493, "language_loss": 0.84416521, "learning_rate": 7.686042586151354e-07, "loss": 0.85462821, "num_input_tokens_seen": 423093680, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 5109, "time_per_iteration": 2.827469825744629 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046109, "balance_loss_mlp": 1.03744864, "diversity_loss_mlp": 0.0, "epoch": 0.9830704116968064, "flos": 537101591040.0, "grad_norm": 0.05962385879994031, "language_loss": 0.82830834, "learning_rate": 7.514335898027857e-07, "loss": 0.83876944, "num_input_tokens_seen": 423168608, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 5110, "time_per_iteration": 2.7789480686187744 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052062, "balance_loss_mlp": 1.0431633, "diversity_loss_mlp": 0.0, "epoch": 0.98326279338207, "flos": 458949597696.0, "grad_norm": 0.08038091049338392, "language_loss": 0.84353125, "learning_rate": 7.344567464735441e-07, "loss": 0.85405189, "num_input_tokens_seen": 423233552, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 5111, "time_per_iteration": 2.504210948944092 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046838, "balance_loss_mlp": 1.03787303, "diversity_loss_mlp": 0.0, "epoch": 0.9834551750673336, "flos": 640974974976.0, "grad_norm": 0.06156712151194387, "language_loss": 0.79174638, "learning_rate": 7.17673735218416e-07, "loss": 0.80221474, "num_input_tokens_seen": 423307440, "router_z_loss_mlp": 0.08972168, "routerloss_mlp": 0.0, "step": 5112, "time_per_iteration": 2.8035426139831543 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045207, "balance_loss_mlp": 1.03661203, "diversity_loss_mlp": 0.0, "epoch": 0.9836475567525972, "flos": 1071807220224.0, "grad_norm": 0.062084580460965294, "language_loss": 0.7939449, "learning_rate": 7.010845625530782e-07, "loss": 0.80439693, "num_input_tokens_seen": 423394880, "router_z_loss_mlp": 0.08605957, "routerloss_mlp": 0.0, "step": 5113, "time_per_iteration": 3.4046199321746826 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051509, "balance_loss_mlp": 1.04283631, "diversity_loss_mlp": 0.0, "epoch": 0.9838399384378607, "flos": 565209778176.0, "grad_norm": 0.08317258429297145, "language_loss": 0.76198953, "learning_rate": 6.846892349181566e-07, "loss": 0.77250463, "num_input_tokens_seen": 423461792, "router_z_loss_mlp": 0.08685303, "routerloss_mlp": 0.0, "step": 5114, "time_per_iteration": 2.668950319290161 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050737, "balance_loss_mlp": 1.04192185, "diversity_loss_mlp": 0.0, "epoch": 0.9840323201231242, "flos": 772805670912.0, "grad_norm": 0.07567501347544295, "language_loss": 0.79288757, "learning_rate": 6.684877586787819e-07, "loss": 0.80339497, "num_input_tokens_seen": 423539952, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 5115, "time_per_iteration": 2.9638354778289795 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046769, "balance_loss_mlp": 1.03803074, "diversity_loss_mlp": 0.0, "epoch": 0.9842247018083878, "flos": 472262358528.0, "grad_norm": 0.07643720957533141, "language_loss": 0.85790366, "learning_rate": 6.524801401249225e-07, "loss": 0.86837137, "num_input_tokens_seen": 423607184, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 5116, "time_per_iteration": 2.5682291984558105 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048242, "balance_loss_mlp": 1.03958189, "diversity_loss_mlp": 0.0, "epoch": 0.9844170834936514, "flos": 525259012608.0, "grad_norm": 0.07092299014904967, "language_loss": 0.84942091, "learning_rate": 6.366663854713295e-07, "loss": 0.85990334, "num_input_tokens_seen": 423676528, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 5117, "time_per_iteration": 2.637977123260498 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003203, "balance_loss_mlp": 0.99879217, "diversity_loss_mlp": 0.0, "epoch": 0.984609465178915, "flos": 1567247408640.0, "grad_norm": 0.004507137876237267, "language_loss": 0.77162516, "learning_rate": 6.210465008574251e-07, "loss": 0.78165722, "num_input_tokens_seen": 423905856, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5118, "time_per_iteration": 4.920542001724243 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01052333, "balance_loss_mlp": 1.04354155, "diversity_loss_mlp": 0.0, "epoch": 0.9848018468641785, "flos": 519548981760.0, "grad_norm": 0.07669150259725825, "language_loss": 0.82077813, "learning_rate": 6.056204923473584e-07, "loss": 0.83130145, "num_input_tokens_seen": 423972496, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 5119, "time_per_iteration": 2.606952428817749 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047086, "balance_loss_mlp": 1.03820455, "diversity_loss_mlp": 0.0, "epoch": 0.9849942285494421, "flos": 493004952576.0, "grad_norm": 0.061362579804974775, "language_loss": 0.83024836, "learning_rate": 5.903883659301167e-07, "loss": 0.84071916, "num_input_tokens_seen": 424039968, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 5120, "time_per_iteration": 2.588484525680542 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051257, "balance_loss_mlp": 1.04235184, "diversity_loss_mlp": 0.0, "epoch": 0.9851866102347057, "flos": 546001606656.0, "grad_norm": 0.0845871079135169, "language_loss": 0.81128502, "learning_rate": 5.753501275193029e-07, "loss": 0.82179761, "num_input_tokens_seen": 424108096, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 5121, "time_per_iteration": 2.6300275325775146 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044388, "balance_loss_mlp": 1.03557277, "diversity_loss_mlp": 0.0, "epoch": 0.9853789919199692, "flos": 476257729536.0, "grad_norm": 0.07512722548004026, "language_loss": 0.80214739, "learning_rate": 5.605057829531912e-07, "loss": 0.81259131, "num_input_tokens_seen": 424172256, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 5122, "time_per_iteration": 2.528691053390503 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051018, "balance_loss_mlp": 1.04198194, "diversity_loss_mlp": 0.0, "epoch": 0.9855713736052328, "flos": 1032619995648.0, "grad_norm": 0.1156037342387967, "language_loss": 0.76233137, "learning_rate": 5.458553379950049e-07, "loss": 0.77284151, "num_input_tokens_seen": 424261088, "router_z_loss_mlp": 0.09033203, "routerloss_mlp": 0.0, "step": 5123, "time_per_iteration": 3.356245517730713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048832, "balance_loss_mlp": 1.04011154, "diversity_loss_mlp": 0.0, "epoch": 0.9857637552904963, "flos": 495050724864.0, "grad_norm": 0.0641282180922578, "language_loss": 0.82703745, "learning_rate": 5.31398798332472e-07, "loss": 0.83752573, "num_input_tokens_seen": 424329168, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 5124, "time_per_iteration": 2.625892400741577 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051607, "balance_loss_mlp": 1.04285097, "diversity_loss_mlp": 0.0, "epoch": 0.9859561369757599, "flos": 592267728384.0, "grad_norm": 0.06640628679407225, "language_loss": 0.8357659, "learning_rate": 5.17136169578103e-07, "loss": 0.84628195, "num_input_tokens_seen": 424399392, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 5125, "time_per_iteration": 2.6943421363830566 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045914, "balance_loss_mlp": 1.03727758, "diversity_loss_mlp": 0.0, "epoch": 0.9861485186610235, "flos": 486971149824.0, "grad_norm": 0.07733437230097125, "language_loss": 0.78536099, "learning_rate": 5.030674572691907e-07, "loss": 0.79582012, "num_input_tokens_seen": 424470080, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 5126, "time_per_iteration": 2.663972854614258 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047998, "balance_loss_mlp": 1.03925443, "diversity_loss_mlp": 0.0, "epoch": 0.9863409003462871, "flos": 518795352576.0, "grad_norm": 0.06032739387712679, "language_loss": 0.82490909, "learning_rate": 4.891926668676994e-07, "loss": 0.83538908, "num_input_tokens_seen": 424541824, "router_z_loss_mlp": 0.08758545, "routerloss_mlp": 0.0, "step": 5127, "time_per_iteration": 2.6729202270507812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003205, "balance_loss_mlp": 0.99879479, "diversity_loss_mlp": 0.0, "epoch": 0.9865332820315506, "flos": 1486026570240.0, "grad_norm": 0.004506363295624896, "language_loss": 0.79182732, "learning_rate": 4.755118037602646e-07, "loss": 0.80185938, "num_input_tokens_seen": 424773408, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5128, "time_per_iteration": 4.911416530609131 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0078477, "balance_loss_mlp": 1.32232308, "diversity_loss_mlp": 0.22574797, "epoch": 0.9867256637168141, "flos": 582112645632.0, "grad_norm": 0.03417894522546616, "language_loss": 0.79182434, "learning_rate": 4.620248732582488e-07, "loss": 0.79967207, "num_input_tokens_seen": 424840608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01073482, "step": 5129, "time_per_iteration": 2.7484471797943115 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0077241, "balance_loss_mlp": 1.299196, "diversity_loss_mlp": 0.22459432, "epoch": 0.9869180454020777, "flos": 959303264256.0, "grad_norm": 0.0327459890880189, "language_loss": 0.86703897, "learning_rate": 4.487318805977969e-07, "loss": 0.87476307, "num_input_tokens_seen": 424926128, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01051447, "step": 5130, "time_per_iteration": 3.2471301555633545 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048775, "balance_loss_mlp": 1.03996539, "diversity_loss_mlp": 0.0, "epoch": 0.9871104270873413, "flos": 770730163200.0, "grad_norm": 0.07462217297713208, "language_loss": 0.82822615, "learning_rate": 4.3563283093966954e-07, "loss": 0.83871394, "num_input_tokens_seen": 425005744, "router_z_loss_mlp": 0.0881958, "routerloss_mlp": 0.0, "step": 5131, "time_per_iteration": 3.0264713764190674 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044797, "balance_loss_mlp": 1.03579676, "diversity_loss_mlp": 0.0, "epoch": 0.9873028087726049, "flos": 446444794368.0, "grad_norm": 0.09684750541354396, "language_loss": 0.78034192, "learning_rate": 4.2272772936940986e-07, "loss": 0.7907899, "num_input_tokens_seen": 425068112, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 5132, "time_per_iteration": 2.501401662826538 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047696, "balance_loss_mlp": 1.03900599, "diversity_loss_mlp": 0.0, "epoch": 0.9874951904578684, "flos": 507612427776.0, "grad_norm": 0.06608816794625222, "language_loss": 0.86122322, "learning_rate": 4.1001658089717676e-07, "loss": 0.87170017, "num_input_tokens_seen": 425137408, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 5133, "time_per_iteration": 2.595851421356201 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046161, "balance_loss_mlp": 1.03731585, "diversity_loss_mlp": 0.0, "epoch": 0.987687572143132, "flos": 716742743040.0, "grad_norm": 0.07376071696211185, "language_loss": 0.81970578, "learning_rate": 3.9749939045791164e-07, "loss": 0.83016741, "num_input_tokens_seen": 425213504, "router_z_loss_mlp": 0.08856201, "routerloss_mlp": 0.0, "step": 5134, "time_per_iteration": 2.899350881576538 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01003204, "balance_loss_mlp": 0.99879336, "diversity_loss_mlp": 0.0, "epoch": 0.9878799538283956, "flos": 1538647695360.0, "grad_norm": 0.004506854986446618, "language_loss": 0.79817951, "learning_rate": 3.851761629111716e-07, "loss": 0.80821157, "num_input_tokens_seen": 425451296, "router_z_loss_mlp": 0.04418945, "routerloss_mlp": 0.0, "step": 5135, "time_per_iteration": 4.867925405502319 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050252, "balance_loss_mlp": 1.04155612, "diversity_loss_mlp": 0.0, "epoch": 0.9880723355136591, "flos": 721424931840.0, "grad_norm": 0.06071682459398163, "language_loss": 0.81917751, "learning_rate": 3.730469030412964e-07, "loss": 0.82968003, "num_input_tokens_seen": 425527536, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 5136, "time_per_iteration": 2.9082465171813965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00784556, "balance_loss_mlp": 1.3212409, "diversity_loss_mlp": 0.22676432, "epoch": 0.9882647171989226, "flos": 557350087680.0, "grad_norm": 0.028741736801368708, "language_loss": 0.84462202, "learning_rate": 3.611116155572969e-07, "loss": 0.8524676, "num_input_tokens_seen": 425596608, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01055351, "step": 5137, "time_per_iteration": 2.687598705291748 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048005, "balance_loss_mlp": 1.03901052, "diversity_loss_mlp": 0.0, "epoch": 0.9884570988841862, "flos": 562820410368.0, "grad_norm": 0.07713102005937741, "language_loss": 0.80440414, "learning_rate": 3.493703050927999e-07, "loss": 0.81488419, "num_input_tokens_seen": 425667280, "router_z_loss_mlp": 0.09002686, "routerloss_mlp": 0.0, "step": 5138, "time_per_iteration": 2.7116920948028564 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046658, "balance_loss_mlp": 1.03775895, "diversity_loss_mlp": 0.0, "epoch": 0.9886494805694498, "flos": 431763167232.0, "grad_norm": 0.07051878557324726, "language_loss": 0.86536169, "learning_rate": 3.378229762062146e-07, "loss": 0.87582827, "num_input_tokens_seen": 425730736, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 5139, "time_per_iteration": 2.477654218673706 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045545, "balance_loss_mlp": 1.03696823, "diversity_loss_mlp": 0.0, "epoch": 0.9888418622547134, "flos": 592082348544.0, "grad_norm": 0.05631423705134008, "language_loss": 0.90553308, "learning_rate": 3.264696333806771e-07, "loss": 0.9159885, "num_input_tokens_seen": 425807616, "router_z_loss_mlp": 0.08587646, "routerloss_mlp": 0.0, "step": 5140, "time_per_iteration": 2.789351224899292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049588, "balance_loss_mlp": 1.04073703, "diversity_loss_mlp": 0.0, "epoch": 0.989034243939977, "flos": 1134993461760.0, "grad_norm": 0.06262136237267299, "language_loss": 0.80186951, "learning_rate": 3.1531028102388394e-07, "loss": 0.81236541, "num_input_tokens_seen": 425900880, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 5141, "time_per_iteration": 3.521420478820801 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104667, "balance_loss_mlp": 1.03778934, "diversity_loss_mlp": 0.0, "epoch": 0.9892266256252404, "flos": 566670048768.0, "grad_norm": 0.0653214866342138, "language_loss": 0.81865728, "learning_rate": 3.0434492346825824e-07, "loss": 0.82912397, "num_input_tokens_seen": 425973632, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 5142, "time_per_iteration": 2.6905152797698975 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046574, "balance_loss_mlp": 1.03794312, "diversity_loss_mlp": 0.0, "epoch": 0.989419007310504, "flos": 640577051136.0, "grad_norm": 0.06437869536727725, "language_loss": 0.83950132, "learning_rate": 2.9357356497095033e-07, "loss": 0.84996706, "num_input_tokens_seen": 426057088, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 5143, "time_per_iteration": 2.9280619621276855 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00775776, "balance_loss_mlp": 1.30826199, "diversity_loss_mlp": 0.22223487, "epoch": 0.9896113889957676, "flos": 455478059520.0, "grad_norm": 0.03094231827555858, "language_loss": 0.81775147, "learning_rate": 2.829962097138372e-07, "loss": 0.82550919, "num_input_tokens_seen": 426124336, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01052798, "step": 5144, "time_per_iteration": 2.6317298412323 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046559, "balance_loss_mlp": 1.03809488, "diversity_loss_mlp": 0.0, "epoch": 0.9898037706810312, "flos": 567339614208.0, "grad_norm": 0.06731066884585553, "language_loss": 0.80676913, "learning_rate": 2.726128618033008e-07, "loss": 0.81723469, "num_input_tokens_seen": 426191888, "router_z_loss_mlp": 0.08465576, "routerloss_mlp": 0.0, "step": 5145, "time_per_iteration": 2.6584229469299316 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00588607, "balance_loss_mlp": 1.02718186, "diversity_loss_mlp": 0.13146883, "epoch": 0.9899961523662947, "flos": 1550268191232.0, "grad_norm": 0.0012619225721446723, "language_loss": 0.78146422, "learning_rate": 2.624235252706164e-07, "loss": 0.7873503, "num_input_tokens_seen": 426425840, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.00928183, "step": 5146, "time_per_iteration": 4.944198369979858 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046886, "balance_loss_mlp": 1.03796947, "diversity_loss_mlp": 0.0, "epoch": 0.9901885340515583, "flos": 610709787648.0, "grad_norm": 0.06397137457157225, "language_loss": 0.85200578, "learning_rate": 2.524282040715642e-07, "loss": 0.86247468, "num_input_tokens_seen": 426506080, "router_z_loss_mlp": 0.0892334, "routerloss_mlp": 0.0, "step": 5147, "time_per_iteration": 2.920581579208374 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045844, "balance_loss_mlp": 1.03715396, "diversity_loss_mlp": 0.0, "epoch": 0.9903809157368219, "flos": 517483385856.0, "grad_norm": 0.06276990657159663, "language_loss": 0.82674694, "learning_rate": 2.426269020866512e-07, "loss": 0.83720535, "num_input_tokens_seen": 426573936, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 5148, "time_per_iteration": 2.5547163486480713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047255, "balance_loss_mlp": 1.0385884, "diversity_loss_mlp": 0.0, "epoch": 0.9905732974220854, "flos": 1100426757120.0, "grad_norm": 0.06810375608375513, "language_loss": 0.80711174, "learning_rate": 2.3301962312122226e-07, "loss": 0.81758434, "num_input_tokens_seen": 426657472, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 5149, "time_per_iteration": 3.4215774536132812 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045945, "balance_loss_mlp": 1.03725505, "diversity_loss_mlp": 0.0, "epoch": 0.990765679107349, "flos": 858002056704.0, "grad_norm": 0.08140595339599294, "language_loss": 0.84472948, "learning_rate": 2.2360637090496073e-07, "loss": 0.85518897, "num_input_tokens_seen": 426740560, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 5150, "time_per_iteration": 3.104238271713257 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046497, "balance_loss_mlp": 1.03784227, "diversity_loss_mlp": 0.0, "epoch": 0.9909580607926125, "flos": 491287721472.0, "grad_norm": 0.07994567324384995, "language_loss": 0.80567187, "learning_rate": 2.143871490925542e-07, "loss": 0.81613684, "num_input_tokens_seen": 426809296, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 5151, "time_per_iteration": 2.597073554992676 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046205, "balance_loss_mlp": 1.03738976, "diversity_loss_mlp": 0.0, "epoch": 0.9911504424778761, "flos": 585060350976.0, "grad_norm": 0.0788095686937427, "language_loss": 0.79632246, "learning_rate": 2.0536196126319519e-07, "loss": 0.80678451, "num_input_tokens_seen": 426881056, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 5152, "time_per_iteration": 2.672553062438965 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104469, "balance_loss_mlp": 1.03561211, "diversity_loss_mlp": 0.0, "epoch": 0.9913428241631397, "flos": 570030359040.0, "grad_norm": 0.06752430275446872, "language_loss": 0.81667304, "learning_rate": 1.9653081092074753e-07, "loss": 0.82711995, "num_input_tokens_seen": 426949664, "router_z_loss_mlp": 0.09088135, "routerloss_mlp": 0.0, "step": 5153, "time_per_iteration": 2.6830427646636963 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047286, "balance_loss_mlp": 1.03867936, "diversity_loss_mlp": 0.0, "epoch": 0.9915352058484033, "flos": 489745958400.0, "grad_norm": 0.06636262173491685, "language_loss": 0.86006486, "learning_rate": 1.8789370149374652e-07, "loss": 0.8705377, "num_input_tokens_seen": 427018816, "router_z_loss_mlp": 0.08618164, "routerloss_mlp": 0.0, "step": 5154, "time_per_iteration": 2.6368730068206787 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104807, "balance_loss_mlp": 1.03920066, "diversity_loss_mlp": 0.0, "epoch": 0.9917275875336667, "flos": 744047741952.0, "grad_norm": 0.060555053830850476, "language_loss": 0.82984126, "learning_rate": 1.7945063633545423e-07, "loss": 0.84032202, "num_input_tokens_seen": 427097984, "router_z_loss_mlp": 0.08874512, "routerloss_mlp": 0.0, "step": 5155, "time_per_iteration": 2.989109754562378 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01043535, "balance_loss_mlp": 1.03463578, "diversity_loss_mlp": 0.0, "epoch": 0.9919199692189303, "flos": 508272081408.0, "grad_norm": 0.06288570543658592, "language_loss": 0.80066729, "learning_rate": 1.7120161872380412e-07, "loss": 0.81110263, "num_input_tokens_seen": 427169280, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 5156, "time_per_iteration": 2.6498100757598877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045724, "balance_loss_mlp": 1.03691423, "diversity_loss_mlp": 0.0, "epoch": 0.9921123509041939, "flos": 543963174912.0, "grad_norm": 0.06594459780967553, "language_loss": 0.84395134, "learning_rate": 1.6314665186123457e-07, "loss": 0.85440862, "num_input_tokens_seen": 427237312, "router_z_loss_mlp": 0.08825684, "routerloss_mlp": 0.0, "step": 5157, "time_per_iteration": 2.6490535736083984 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045394, "balance_loss_mlp": 1.03646517, "diversity_loss_mlp": 0.0, "epoch": 0.9923047325894575, "flos": 671561362944.0, "grad_norm": 0.06545947039571581, "language_loss": 0.77654356, "learning_rate": 1.5528573887507724e-07, "loss": 0.78699744, "num_input_tokens_seen": 427305008, "router_z_loss_mlp": 0.08929443, "routerloss_mlp": 0.0, "step": 5158, "time_per_iteration": 2.7639706134796143 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047617, "balance_loss_mlp": 1.03868222, "diversity_loss_mlp": 0.0, "epoch": 0.9924971142747211, "flos": 466557096960.0, "grad_norm": 0.06168897901648668, "language_loss": 0.8080498, "learning_rate": 1.4761888281711322e-07, "loss": 0.81852591, "num_input_tokens_seen": 427377008, "router_z_loss_mlp": 0.08935547, "routerloss_mlp": 0.0, "step": 5159, "time_per_iteration": 2.7385036945343018 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049861, "balance_loss_mlp": 1.04073584, "diversity_loss_mlp": 0.0, "epoch": 0.9926894959599846, "flos": 491581757952.0, "grad_norm": 0.06899221386615825, "language_loss": 0.82835615, "learning_rate": 1.4014608666390594e-07, "loss": 0.83885473, "num_input_tokens_seen": 427444528, "router_z_loss_mlp": 0.09124756, "routerloss_mlp": 0.0, "step": 5160, "time_per_iteration": 2.559859037399292 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050661, "balance_loss_mlp": 1.04187524, "diversity_loss_mlp": 0.0, "epoch": 0.9928818776452482, "flos": 492389715456.0, "grad_norm": 0.08668343737324606, "language_loss": 0.81916565, "learning_rate": 1.328673533166902e-07, "loss": 0.82967234, "num_input_tokens_seen": 427509808, "router_z_loss_mlp": 0.0880127, "routerloss_mlp": 0.0, "step": 5161, "time_per_iteration": 2.5678670406341553 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048471, "balance_loss_mlp": 1.03970289, "diversity_loss_mlp": 0.0, "epoch": 0.9930742593305117, "flos": 546357312000.0, "grad_norm": 0.06843444651252836, "language_loss": 0.84165454, "learning_rate": 1.2578268560131666e-07, "loss": 0.85213923, "num_input_tokens_seen": 427587936, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 5162, "time_per_iteration": 2.7581584453582764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047155, "balance_loss_mlp": 1.03851247, "diversity_loss_mlp": 0.0, "epoch": 0.9932666410157753, "flos": 585510031872.0, "grad_norm": 0.06263196001846472, "language_loss": 0.85711837, "learning_rate": 1.1889208626825188e-07, "loss": 0.86758995, "num_input_tokens_seen": 427662224, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 5163, "time_per_iteration": 2.7846977710723877 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046625, "balance_loss_mlp": 1.0378511, "diversity_loss_mlp": 0.0, "epoch": 0.9934590227010388, "flos": 537086909952.0, "grad_norm": 0.06164233206359557, "language_loss": 0.83855546, "learning_rate": 1.1219555799268921e-07, "loss": 0.84902167, "num_input_tokens_seen": 427730544, "router_z_loss_mlp": 0.08782959, "routerloss_mlp": 0.0, "step": 5164, "time_per_iteration": 2.716646671295166 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047268, "balance_loss_mlp": 1.03856587, "diversity_loss_mlp": 0.0, "epoch": 0.9936514043863024, "flos": 518014559232.0, "grad_norm": 0.06133860998625567, "language_loss": 0.86944854, "learning_rate": 1.0569310337443794e-07, "loss": 0.8799212, "num_input_tokens_seen": 427799760, "router_z_loss_mlp": 0.08703613, "routerloss_mlp": 0.0, "step": 5165, "time_per_iteration": 2.614095687866211 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104736, "balance_loss_mlp": 1.03866947, "diversity_loss_mlp": 0.0, "epoch": 0.993843786071566, "flos": 744625903104.0, "grad_norm": 0.06754893239543082, "language_loss": 0.80281818, "learning_rate": 9.938472493803419e-08, "loss": 0.81329167, "num_input_tokens_seen": 427881936, "router_z_loss_mlp": 0.0869751, "routerloss_mlp": 0.0, "step": 5166, "time_per_iteration": 3.028465986251831 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01050745, "balance_loss_mlp": 1.04209042, "diversity_loss_mlp": 0.0, "epoch": 0.9940361677568296, "flos": 525918666240.0, "grad_norm": 0.06956871932384841, "language_loss": 0.82008004, "learning_rate": 9.327042513251893e-08, "loss": 0.83058745, "num_input_tokens_seen": 427951648, "router_z_loss_mlp": 0.08666992, "routerloss_mlp": 0.0, "step": 5167, "time_per_iteration": 2.698882818222046 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104561, "balance_loss_mlp": 1.03672278, "diversity_loss_mlp": 0.0, "epoch": 0.9942285494420932, "flos": 555650108928.0, "grad_norm": 0.06410012888366921, "language_loss": 0.80157578, "learning_rate": 8.735020633177104e-08, "loss": 0.81203187, "num_input_tokens_seen": 428031184, "router_z_loss_mlp": 0.08898926, "routerloss_mlp": 0.0, "step": 5168, "time_per_iteration": 2.7812376022338867 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046457, "balance_loss_mlp": 1.0377903, "diversity_loss_mlp": 0.0, "epoch": 0.9944209311273566, "flos": 585996788736.0, "grad_norm": 0.06620347908149736, "language_loss": 0.82235384, "learning_rate": 8.162407083411872e-08, "loss": 0.83281839, "num_input_tokens_seen": 428107296, "router_z_loss_mlp": 0.08673096, "routerloss_mlp": 0.0, "step": 5169, "time_per_iteration": 2.7237818241119385 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047633, "balance_loss_mlp": 1.0389545, "diversity_loss_mlp": 0.0, "epoch": 0.9946133128126202, "flos": 735518486016.0, "grad_norm": 0.06912708749251066, "language_loss": 0.82253057, "learning_rate": 7.609202086272804e-08, "loss": 0.83300692, "num_input_tokens_seen": 428187904, "router_z_loss_mlp": 0.08691406, "routerloss_mlp": 0.0, "step": 5170, "time_per_iteration": 2.9818952083587646 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047087, "balance_loss_mlp": 1.03824186, "diversity_loss_mlp": 0.0, "epoch": 0.9948056944978838, "flos": 646018011648.0, "grad_norm": 0.08243647739411311, "language_loss": 0.82281691, "learning_rate": 7.075405856526995e-08, "loss": 0.83328784, "num_input_tokens_seen": 428255856, "router_z_loss_mlp": 0.08850098, "routerloss_mlp": 0.0, "step": 5171, "time_per_iteration": 2.7422502040863037 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104891, "balance_loss_mlp": 1.04017246, "diversity_loss_mlp": 0.0, "epoch": 0.9949980761831474, "flos": 445846809600.0, "grad_norm": 0.06824796371814347, "language_loss": 0.86093032, "learning_rate": 6.561018601414226e-08, "loss": 0.87141943, "num_input_tokens_seen": 428321872, "router_z_loss_mlp": 0.08746338, "routerloss_mlp": 0.0, "step": 5172, "time_per_iteration": 2.51432728767395 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046756, "balance_loss_mlp": 1.03810704, "diversity_loss_mlp": 0.0, "epoch": 0.995190457868411, "flos": 435637398528.0, "grad_norm": 0.06509423598404523, "language_loss": 0.85527599, "learning_rate": 6.066040520641414e-08, "loss": 0.86574364, "num_input_tokens_seen": 428389232, "router_z_loss_mlp": 0.08654785, "routerloss_mlp": 0.0, "step": 5173, "time_per_iteration": 2.6191818714141846 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047522, "balance_loss_mlp": 1.0386107, "diversity_loss_mlp": 0.0, "epoch": 0.9953828395536745, "flos": 514187315712.0, "grad_norm": 0.06870476422803651, "language_loss": 0.81628877, "learning_rate": 5.590471806377062e-08, "loss": 0.82676393, "num_input_tokens_seen": 428456128, "router_z_loss_mlp": 0.08911133, "routerloss_mlp": 0.0, "step": 5174, "time_per_iteration": 2.569406270980835 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046917, "balance_loss_mlp": 1.03805971, "diversity_loss_mlp": 0.0, "epoch": 0.995575221238938, "flos": 479847836160.0, "grad_norm": 0.06879136838428648, "language_loss": 0.81909287, "learning_rate": 5.134312643245709e-08, "loss": 0.82956201, "num_input_tokens_seen": 428523504, "router_z_loss_mlp": 0.08862305, "routerloss_mlp": 0.0, "step": 5175, "time_per_iteration": 2.5882654190063477 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01049317, "balance_loss_mlp": 1.04018593, "diversity_loss_mlp": 0.0, "epoch": 0.9957676029242016, "flos": 587785600512.0, "grad_norm": 0.08802784581931292, "language_loss": 0.76484299, "learning_rate": 4.6975632083445793e-08, "loss": 0.77533621, "num_input_tokens_seen": 428596880, "router_z_loss_mlp": 0.09130859, "routerloss_mlp": 0.0, "step": 5176, "time_per_iteration": 2.7355172634124756 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00781269, "balance_loss_mlp": 1.31630397, "diversity_loss_mlp": 0.2250234, "epoch": 0.9959599846094652, "flos": 426465741312.0, "grad_norm": 0.03484461119289524, "language_loss": 0.80370349, "learning_rate": 4.280223671243588e-08, "loss": 0.81151617, "num_input_tokens_seen": 428659472, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01060532, "step": 5177, "time_per_iteration": 2.488933563232422 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045722, "balance_loss_mlp": 1.03673339, "diversity_loss_mlp": 0.0, "epoch": 0.9961523662947287, "flos": 611619061248.0, "grad_norm": 0.060646192988618466, "language_loss": 0.80473614, "learning_rate": 3.8822941939575804e-08, "loss": 0.81519341, "num_input_tokens_seen": 428736704, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 5178, "time_per_iteration": 2.860849380493164 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045766, "balance_loss_mlp": 1.03690243, "diversity_loss_mlp": 0.0, "epoch": 0.9963447479799923, "flos": 550785111552.0, "grad_norm": 0.06956117500096984, "language_loss": 0.73755258, "learning_rate": 3.5037749309851927e-08, "loss": 0.74801028, "num_input_tokens_seen": 428808560, "router_z_loss_mlp": 0.08868408, "routerloss_mlp": 0.0, "step": 5179, "time_per_iteration": 2.652787446975708 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01051578, "balance_loss_mlp": 1.04275656, "diversity_loss_mlp": 0.0, "epoch": 0.9965371296652559, "flos": 625873402368.0, "grad_norm": 0.081637230316847, "language_loss": 0.89049286, "learning_rate": 3.1446660292755446e-08, "loss": 0.90100861, "num_input_tokens_seen": 428880688, "router_z_loss_mlp": 0.08831787, "routerloss_mlp": 0.0, "step": 5180, "time_per_iteration": 2.7644760608673096 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048065, "balance_loss_mlp": 1.03896928, "diversity_loss_mlp": 0.0, "epoch": 0.9967295113505195, "flos": 639522044928.0, "grad_norm": 0.0759879935902396, "language_loss": 0.81941384, "learning_rate": 2.8049676282504433e-08, "loss": 0.82989448, "num_input_tokens_seen": 428960096, "router_z_loss_mlp": 0.09100342, "routerloss_mlp": 0.0, "step": 5181, "time_per_iteration": 2.9104771614074707 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046151, "balance_loss_mlp": 1.03727567, "diversity_loss_mlp": 0.0, "epoch": 0.996921893035783, "flos": 607389124608.0, "grad_norm": 0.0884261396290618, "language_loss": 0.76887906, "learning_rate": 2.484679859793282e-08, "loss": 0.77934057, "num_input_tokens_seen": 429031296, "router_z_loss_mlp": 0.08886719, "routerloss_mlp": 0.0, "step": 5182, "time_per_iteration": 2.721599578857422 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048224, "balance_loss_mlp": 1.03908062, "diversity_loss_mlp": 0.0, "epoch": 0.9971142747210465, "flos": 644162388480.0, "grad_norm": 0.0648988132762576, "language_loss": 0.81727201, "learning_rate": 2.183802848243488e-08, "loss": 0.82775426, "num_input_tokens_seen": 429103312, "router_z_loss_mlp": 0.09155273, "routerloss_mlp": 0.0, "step": 5183, "time_per_iteration": 2.7815635204315186 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048087, "balance_loss_mlp": 1.03952742, "diversity_loss_mlp": 0.0, "epoch": 0.9973066564063101, "flos": 1040773722624.0, "grad_norm": 0.05502432672300637, "language_loss": 0.81058741, "learning_rate": 1.9023367104187285e-08, "loss": 0.82106829, "num_input_tokens_seen": 429194896, "router_z_loss_mlp": 0.08569336, "routerloss_mlp": 0.0, "step": 5184, "time_per_iteration": 3.372502326965332 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0105351, "balance_loss_mlp": 1.04470634, "diversity_loss_mlp": 0.0, "epoch": 0.9974990380915737, "flos": 665095131648.0, "grad_norm": 0.08025246784684749, "language_loss": 0.83187962, "learning_rate": 1.640281555587153e-08, "loss": 0.84241462, "num_input_tokens_seen": 429267664, "router_z_loss_mlp": 0.08813477, "routerloss_mlp": 0.0, "step": 5185, "time_per_iteration": 2.835519313812256 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01047458, "balance_loss_mlp": 1.03881598, "diversity_loss_mlp": 0.0, "epoch": 0.9976914197768373, "flos": 718121521152.0, "grad_norm": 0.06904687845719167, "language_loss": 0.77359349, "learning_rate": 1.3976374855007024e-08, "loss": 0.78406811, "num_input_tokens_seen": 429343472, "router_z_loss_mlp": 0.08648682, "routerloss_mlp": 0.0, "step": 5186, "time_per_iteration": 2.8937785625457764 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048144, "balance_loss_mlp": 1.03904831, "diversity_loss_mlp": 0.0, "epoch": 0.9978838014621008, "flos": 518328419328.0, "grad_norm": 0.07280590001962838, "language_loss": 0.79471743, "learning_rate": 1.1744045943451464e-08, "loss": 0.80519885, "num_input_tokens_seen": 429411472, "router_z_loss_mlp": 0.09094238, "routerloss_mlp": 0.0, "step": 5187, "time_per_iteration": 2.635932207107544 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01044774, "balance_loss_mlp": 1.03606606, "diversity_loss_mlp": 0.0, "epoch": 0.9980761831473643, "flos": 603430829568.0, "grad_norm": 0.05359795749809877, "language_loss": 0.84325933, "learning_rate": 9.70582968801148e-09, "loss": 0.85370713, "num_input_tokens_seen": 429486704, "router_z_loss_mlp": 0.08721924, "routerloss_mlp": 0.0, "step": 5188, "time_per_iteration": 2.7615973949432373 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045401, "balance_loss_mlp": 1.03626382, "diversity_loss_mlp": 0.0, "epoch": 0.9982685648326279, "flos": 453523691520.0, "grad_norm": 0.0657633073490906, "language_loss": 0.8937813, "learning_rate": 7.861726879943021e-09, "loss": 0.9042353, "num_input_tokens_seen": 429554736, "router_z_loss_mlp": 0.09136963, "routerloss_mlp": 0.0, "step": 5189, "time_per_iteration": 2.543257236480713 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045558, "balance_loss_mlp": 1.03698051, "diversity_loss_mlp": 0.0, "epoch": 0.9984609465178915, "flos": 481424103936.0, "grad_norm": 0.0777283177143095, "language_loss": 0.78666133, "learning_rate": 6.211738235173403e-09, "loss": 0.79711688, "num_input_tokens_seen": 429623216, "router_z_loss_mlp": 0.08581543, "routerloss_mlp": 0.0, "step": 5190, "time_per_iteration": 2.6314117908477783 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.010468, "balance_loss_mlp": 1.03816903, "diversity_loss_mlp": 0.0, "epoch": 0.9986533282031551, "flos": 476941976064.0, "grad_norm": 0.05898093011437241, "language_loss": 0.84184742, "learning_rate": 4.755864394301312e-09, "loss": 0.85231537, "num_input_tokens_seen": 429695808, "router_z_loss_mlp": 0.08636475, "routerloss_mlp": 0.0, "step": 5191, "time_per_iteration": 2.6695079803466797 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.0104994, "balance_loss_mlp": 1.04094553, "diversity_loss_mlp": 0.0, "epoch": 0.9988457098884186, "flos": 641948488704.0, "grad_norm": 0.06405577435904004, "language_loss": 0.86847579, "learning_rate": 3.494105922541291e-09, "loss": 0.87897515, "num_input_tokens_seen": 429774464, "router_z_loss_mlp": 0.08996582, "routerloss_mlp": 0.0, "step": 5192, "time_per_iteration": 2.8024892807006836 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01046763, "balance_loss_mlp": 1.03778648, "diversity_loss_mlp": 0.0, "epoch": 0.9990380915736822, "flos": 396321693696.0, "grad_norm": 0.0686453524231272, "language_loss": 0.88108921, "learning_rate": 2.4264633097237365e-09, "loss": 0.89155686, "num_input_tokens_seen": 429835872, "router_z_loss_mlp": 0.08978271, "routerloss_mlp": 0.0, "step": 5193, "time_per_iteration": 2.4370131492614746 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01045344, "balance_loss_mlp": 1.0364393, "diversity_loss_mlp": 0.0, "epoch": 0.9992304732589458, "flos": 576123259392.0, "grad_norm": 0.06828670759326802, "language_loss": 0.85050082, "learning_rate": 1.552936970405927e-09, "loss": 0.86095428, "num_input_tokens_seen": 429911440, "router_z_loss_mlp": 0.08905029, "routerloss_mlp": 0.0, "step": 5194, "time_per_iteration": 2.765718698501587 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01048829, "balance_loss_mlp": 1.04024625, "diversity_loss_mlp": 0.0, "epoch": 0.9994228549442093, "flos": 544291716096.0, "grad_norm": 0.07220046609149769, "language_loss": 0.75592577, "learning_rate": 8.735272437054853e-10, "loss": 0.76641411, "num_input_tokens_seen": 429982512, "router_z_loss_mlp": 0.08587646, "routerloss_mlp": 0.0, "step": 5195, "time_per_iteration": 2.713330030441284 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.00790766, "balance_loss_mlp": 1.33585405, "diversity_loss_mlp": 0.22418211, "epoch": 0.9996152366294728, "flos": 1471314502656.0, "grad_norm": 0.03504416823087641, "language_loss": 0.81017089, "learning_rate": 3.882343933003796e-10, "loss": 0.81807852, "num_input_tokens_seen": 430070944, "router_z_loss_mlp": 0.0, "routerloss_mlp": 0.01074793, "step": 5196, "time_per_iteration": 3.730872631072998 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01036634, "balance_loss_mlp": 1.028754, "diversity_loss_mlp": 0.0, "epoch": 0.9998076183147364, "flos": 618950149632.0, "grad_norm": 0.09543829836144671, "language_loss": 0.69830346, "learning_rate": 9.70586077619906e-11, "loss": 0.70866984, "num_input_tokens_seen": 430164864, "router_z_loss_mlp": 0.0788269, "routerloss_mlp": 0.0, "step": 5197, "time_per_iteration": 4.026475429534912 }, { "auxiliary_loss_clip": 0.0, "auxiliary_loss_mlp": 0.01018596, "balance_loss_mlp": 1.01257348, "diversity_loss_mlp": 0.0, "epoch": 1.0, "flos": 1290737617920.0, "grad_norm": 0.032396730253084045, "language_loss": 0.84149116, "learning_rate": 0.0, "loss": 0.85167712, "num_input_tokens_seen": 430340944, "router_z_loss_mlp": 0.06033325, "routerloss_mlp": 0.0, "step": 5198, "time_per_iteration": 5.587369918823242 }, { "epoch": 1.0, "num_input_tokens_seen": 430340944, "step": 5198, "total_flos": 1.171926856433664e+16, "train_loss": 0.8587041911183526, "train_runtime": 15568.2077, "train_samples_per_second": 42.734, "train_steps_per_second": 0.334 } ], "logging_steps": 1.0, "max_steps": 5198, "num_input_tokens_seen": 430340944, "num_train_epochs": 1, "save_steps": 1040, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.171926856433664e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }