diff --git "a/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json" "b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft_pretrain/Full_smoe_sharev3/checkpoint-2080/trainer_state.json" @@ -0,0 +1,31233 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001539053482108, + "eval_steps": 500, + "global_step": 2080, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090351, + "balance_loss_mlp": 1.05697203, + "epoch": 0.200269334359369, + "flos": 571758594048.0, + "grad_norm": 0.05851279903795688, + "language_loss": 0.84325236, + "learning_rate": 0.000925888133132719, + "loss": 0.85415584, + "num_input_tokens_seen": 86289824, + "router_z_loss_mlp": 0.33398438, + "step": 1041, + "time_per_iteration": 2.836177110671997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082089, + "balance_loss_mlp": 1.06730711, + "epoch": 0.20046171604463256, + "flos": 1485362340864.0, + "grad_norm": 0.029405325300647274, + "language_loss": 0.79610431, + "learning_rate": 0.0009257248316761906, + "loss": 0.80692518, + "num_input_tokens_seen": 86516384, + "router_z_loss_mlp": 0.14746094, + "step": 1042, + "time_per_iteration": 4.901337146759033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06140149, + "epoch": 0.20065409772989612, + "flos": 496266808320.0, + "grad_norm": 0.07205728359427886, + "language_loss": 0.81256473, + "learning_rate": 0.0009255613649386244, + "loss": 0.82351422, + "num_input_tokens_seen": 86587296, + "router_z_loss_mlp": 0.33544922, + "step": 1043, + "time_per_iteration": 2.6198885440826416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089381, + "balance_loss_mlp": 1.05686069, + "epoch": 0.20084647941515968, + "flos": 579094631424.0, + "grad_norm": 0.06625931968059934, + "language_loss": 0.79017001, + "learning_rate": 0.0009253977329834838, + "loss": 0.80106384, + "num_input_tokens_seen": 86662656, + "router_z_loss_mlp": 0.32519531, + "step": 1044, + "time_per_iteration": 2.6872498989105225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111234, + "balance_loss_mlp": 1.07745981, + "epoch": 0.20103886110042324, + "flos": 641775273984.0, + "grad_norm": 0.06628294367657735, + "language_loss": 0.86666185, + "learning_rate": 0.0009252339358742965, + "loss": 0.87778521, + "num_input_tokens_seen": 86734704, + "router_z_loss_mlp": 0.34912109, + "step": 1045, + "time_per_iteration": 2.7749996185302734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116952, + "balance_loss_mlp": 1.08219087, + "epoch": 0.2012312427856868, + "flos": 441720953856.0, + "grad_norm": 0.05401214919341486, + "language_loss": 0.83449644, + "learning_rate": 0.000925069973674654, + "loss": 0.84566593, + "num_input_tokens_seen": 86806512, + "router_z_loss_mlp": 0.34814453, + "step": 1046, + "time_per_iteration": 2.662992477416992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114258, + "balance_loss_mlp": 1.08116508, + "epoch": 0.20142362447095036, + "flos": 554135855616.0, + "grad_norm": 0.049297877184233195, + "language_loss": 0.88960069, + "learning_rate": 0.000924905846448212, + "loss": 0.90074325, + "num_input_tokens_seen": 86883440, + "router_z_loss_mlp": 0.33105469, + "step": 1047, + "time_per_iteration": 2.8164925575256348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127137, + "balance_loss_mlp": 1.09306693, + "epoch": 0.20161600615621392, + "flos": 669988841472.0, + "grad_norm": 0.07365230282100185, + "language_loss": 0.85615861, + "learning_rate": 0.0009247415542586906, + "loss": 0.86742997, + "num_input_tokens_seen": 86960208, + "router_z_loss_mlp": 0.34106445, + "step": 1048, + "time_per_iteration": 2.858611822128296 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115399, + "balance_loss_mlp": 1.08130527, + "epoch": 0.2018083878414775, + "flos": 572788689408.0, + "grad_norm": 0.05223287600750505, + "language_loss": 0.83514655, + "learning_rate": 0.0009245770971698735, + "loss": 0.84630048, + "num_input_tokens_seen": 87044144, + "router_z_loss_mlp": 0.34106445, + "step": 1049, + "time_per_iteration": 2.8758163452148438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103626, + "balance_loss_mlp": 1.07036686, + "epoch": 0.20200076952674106, + "flos": 425624495616.0, + "grad_norm": 0.061140118103518055, + "language_loss": 0.88792473, + "learning_rate": 0.0009244124752456087, + "loss": 0.89896095, + "num_input_tokens_seen": 87109136, + "router_z_loss_mlp": 0.33276367, + "step": 1050, + "time_per_iteration": 2.501565456390381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097765, + "balance_loss_mlp": 1.06457758, + "epoch": 0.20219315121200462, + "flos": 536326852608.0, + "grad_norm": 0.049507299183714965, + "language_loss": 0.85344577, + "learning_rate": 0.0009242476885498081, + "loss": 0.86442339, + "num_input_tokens_seen": 87184320, + "router_z_loss_mlp": 0.33203125, + "step": 1051, + "time_per_iteration": 2.698791027069092 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095962, + "balance_loss_mlp": 1.06222594, + "epoch": 0.20238553289726818, + "flos": 477634323456.0, + "grad_norm": 0.07140169421024865, + "language_loss": 0.8134433, + "learning_rate": 0.0009240827371464474, + "loss": 0.82440293, + "num_input_tokens_seen": 87248224, + "router_z_loss_mlp": 0.33764648, + "step": 1052, + "time_per_iteration": 2.5603079795837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082157, + "balance_loss_mlp": 1.04958868, + "epoch": 0.20257791458253174, + "flos": 1151611430400.0, + "grad_norm": 0.06069279327125781, + "language_loss": 0.84372044, + "learning_rate": 0.0009239176210995666, + "loss": 0.85454196, + "num_input_tokens_seen": 87333088, + "router_z_loss_mlp": 0.32568359, + "step": 1053, + "time_per_iteration": 3.4549684524536133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088056, + "balance_loss_mlp": 1.05393791, + "epoch": 0.2027702962677953, + "flos": 666606011904.0, + "grad_norm": 0.06066867592012189, + "language_loss": 0.93657684, + "learning_rate": 0.0009237523404732695, + "loss": 0.94745743, + "num_input_tokens_seen": 87413840, + "router_z_loss_mlp": 0.34130859, + "step": 1054, + "time_per_iteration": 4.344247817993164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078384, + "balance_loss_mlp": 1.04567289, + "epoch": 0.20296267795305886, + "flos": 641011428864.0, + "grad_norm": 0.0678922331878557, + "language_loss": 0.84289086, + "learning_rate": 0.0009235868953317235, + "loss": 0.85367465, + "num_input_tokens_seen": 87487168, + "router_z_loss_mlp": 0.32714844, + "step": 1055, + "time_per_iteration": 2.7755184173583984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086857, + "balance_loss_mlp": 1.05321646, + "epoch": 0.20315505963832242, + "flos": 930187777536.0, + "grad_norm": 0.06816541670806936, + "language_loss": 0.85603452, + "learning_rate": 0.0009234212857391602, + "loss": 0.86690307, + "num_input_tokens_seen": 87573184, + "router_z_loss_mlp": 0.33642578, + "step": 1056, + "time_per_iteration": 3.1736087799072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089034, + "balance_loss_mlp": 1.05477369, + "epoch": 0.20334744132358598, + "flos": 561818128896.0, + "grad_norm": 0.05209348313890264, + "language_loss": 0.88978589, + "learning_rate": 0.000923255511759875, + "loss": 0.90067613, + "num_input_tokens_seen": 87651968, + "router_z_loss_mlp": 0.34301758, + "step": 1057, + "time_per_iteration": 2.7823617458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093097, + "balance_loss_mlp": 1.05945587, + "epoch": 0.20353982300884957, + "flos": 643902455808.0, + "grad_norm": 0.061337083912670884, + "language_loss": 0.85219932, + "learning_rate": 0.000923089573458227, + "loss": 0.86313027, + "num_input_tokens_seen": 87727792, + "router_z_loss_mlp": 0.33666992, + "step": 1058, + "time_per_iteration": 2.8398988246917725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092738, + "balance_loss_mlp": 1.05952644, + "epoch": 0.20373220469411313, + "flos": 651101690880.0, + "grad_norm": 0.0713114334987562, + "language_loss": 0.84425724, + "learning_rate": 0.0009229234708986392, + "loss": 0.85518456, + "num_input_tokens_seen": 87806048, + "router_z_loss_mlp": 0.33203125, + "step": 1059, + "time_per_iteration": 2.891934394836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069957, + "balance_loss_mlp": 1.05603302, + "epoch": 0.2039245863793767, + "flos": 1436939136000.0, + "grad_norm": 0.037855568460977755, + "language_loss": 0.81666899, + "learning_rate": 0.0009227572041455982, + "loss": 0.8273685, + "num_input_tokens_seen": 88018160, + "router_z_loss_mlp": 0.13964844, + "step": 1060, + "time_per_iteration": 4.673142194747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092929, + "balance_loss_mlp": 1.05985999, + "epoch": 0.20411696806464025, + "flos": 596678082048.0, + "grad_norm": 0.07190006801568614, + "language_loss": 0.85404283, + "learning_rate": 0.0009225907732636548, + "loss": 0.86497211, + "num_input_tokens_seen": 88090864, + "router_z_loss_mlp": 0.33081055, + "step": 1061, + "time_per_iteration": 2.74110746383667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091714, + "balance_loss_mlp": 1.0585742, + "epoch": 0.2043093497499038, + "flos": 573530775552.0, + "grad_norm": 0.06271161302412134, + "language_loss": 0.86991799, + "learning_rate": 0.0009224241783174227, + "loss": 0.88083506, + "num_input_tokens_seen": 88161360, + "router_z_loss_mlp": 0.33154297, + "step": 1062, + "time_per_iteration": 2.6885697841644287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084619, + "balance_loss_mlp": 1.05233693, + "epoch": 0.20450173143516737, + "flos": 630061217280.0, + "grad_norm": 0.055816021094363524, + "language_loss": 0.85842204, + "learning_rate": 0.0009222574193715802, + "loss": 0.86926818, + "num_input_tokens_seen": 88234960, + "router_z_loss_mlp": 0.32275391, + "step": 1063, + "time_per_iteration": 2.7569899559020996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087796, + "balance_loss_mlp": 1.05522823, + "epoch": 0.20469411312043093, + "flos": 573718450176.0, + "grad_norm": 0.051897822989382614, + "language_loss": 0.8621105, + "learning_rate": 0.000922090496490869, + "loss": 0.87298846, + "num_input_tokens_seen": 88308176, + "router_z_loss_mlp": 0.32568359, + "step": 1064, + "time_per_iteration": 2.7099597454071045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082485, + "balance_loss_mlp": 1.05025065, + "epoch": 0.20488649480569449, + "flos": 636748300800.0, + "grad_norm": 0.04962250787968228, + "language_loss": 0.90165728, + "learning_rate": 0.0009219234097400937, + "loss": 0.91248214, + "num_input_tokens_seen": 88386768, + "router_z_loss_mlp": 0.32226562, + "step": 1065, + "time_per_iteration": 2.8492319583892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108042, + "balance_loss_mlp": 1.04806709, + "epoch": 0.20507887649095807, + "flos": 975383894016.0, + "grad_norm": 0.051536979552593745, + "language_loss": 0.83029723, + "learning_rate": 0.0009217561591841237, + "loss": 0.84110147, + "num_input_tokens_seen": 88476576, + "router_z_loss_mlp": 0.32348633, + "step": 1066, + "time_per_iteration": 3.267207145690918 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082085, + "balance_loss_mlp": 1.04951739, + "epoch": 0.20527125817622163, + "flos": 485940819456.0, + "grad_norm": 0.09661652793466288, + "language_loss": 0.81334901, + "learning_rate": 0.0009215887448878913, + "loss": 0.82416987, + "num_input_tokens_seen": 88541968, + "router_z_loss_mlp": 0.32568359, + "step": 1067, + "time_per_iteration": 2.5429391860961914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088998, + "balance_loss_mlp": 1.05552411, + "epoch": 0.2054636398614852, + "flos": 526921860096.0, + "grad_norm": 0.09953641970782799, + "language_loss": 0.85144234, + "learning_rate": 0.0009214211669163922, + "loss": 0.86233234, + "num_input_tokens_seen": 88615296, + "router_z_loss_mlp": 0.33496094, + "step": 1068, + "time_per_iteration": 2.7006540298461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085875, + "balance_loss_mlp": 1.05428481, + "epoch": 0.20565602154674875, + "flos": 557898416640.0, + "grad_norm": 0.048729379907622286, + "language_loss": 0.93896133, + "learning_rate": 0.0009212534253346862, + "loss": 0.94982004, + "num_input_tokens_seen": 88691584, + "router_z_loss_mlp": 0.31567383, + "step": 1069, + "time_per_iteration": 2.7544496059417725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098993, + "balance_loss_mlp": 1.06713986, + "epoch": 0.2058484032320123, + "flos": 503976784896.0, + "grad_norm": 0.06649355865978995, + "language_loss": 0.8497259, + "learning_rate": 0.0009210855202078964, + "loss": 0.86071587, + "num_input_tokens_seen": 88756592, + "router_z_loss_mlp": 0.31835938, + "step": 1070, + "time_per_iteration": 2.59660005569458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113348, + "balance_loss_mlp": 1.07975471, + "epoch": 0.20604078491727587, + "flos": 432950358528.0, + "grad_norm": 0.06315152856471482, + "language_loss": 0.87476587, + "learning_rate": 0.0009209174516012091, + "loss": 0.88589936, + "num_input_tokens_seen": 88820928, + "router_z_loss_mlp": 0.3359375, + "step": 1071, + "time_per_iteration": 2.498087167739868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110052, + "balance_loss_mlp": 1.07624412, + "epoch": 0.20623316660253943, + "flos": 608421252096.0, + "grad_norm": 0.06211591839104366, + "language_loss": 0.89244497, + "learning_rate": 0.0009207492195798747, + "loss": 0.9035455, + "num_input_tokens_seen": 88895440, + "router_z_loss_mlp": 0.33837891, + "step": 1072, + "time_per_iteration": 2.760019063949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116237, + "balance_loss_mlp": 1.08142757, + "epoch": 0.206425548287803, + "flos": 480184906752.0, + "grad_norm": 0.07379229384440758, + "language_loss": 0.84887302, + "learning_rate": 0.0009205808242092061, + "loss": 0.86003542, + "num_input_tokens_seen": 88964400, + "router_z_loss_mlp": 0.34838867, + "step": 1073, + "time_per_iteration": 2.6034529209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118683, + "balance_loss_mlp": 1.08423102, + "epoch": 0.20661792997306658, + "flos": 949007937024.0, + "grad_norm": 0.0763588165275792, + "language_loss": 0.82845032, + "learning_rate": 0.0009204122655545808, + "loss": 0.83963716, + "num_input_tokens_seen": 89049600, + "router_z_loss_mlp": 0.34472656, + "step": 1074, + "time_per_iteration": 3.3222029209136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111187, + "balance_loss_mlp": 1.07759392, + "epoch": 0.20681031165833014, + "flos": 603206604288.0, + "grad_norm": 0.05592396046249817, + "language_loss": 0.80705297, + "learning_rate": 0.0009202435436814388, + "loss": 0.81816483, + "num_input_tokens_seen": 89119024, + "router_z_loss_mlp": 0.33618164, + "step": 1075, + "time_per_iteration": 2.721888780593872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114903, + "balance_loss_mlp": 1.08121455, + "epoch": 0.2070026933435937, + "flos": 708665200128.0, + "grad_norm": 0.07630450069092473, + "language_loss": 0.89700603, + "learning_rate": 0.0009200746586552836, + "loss": 0.90815508, + "num_input_tokens_seen": 89197344, + "router_z_loss_mlp": 0.3371582, + "step": 1076, + "time_per_iteration": 2.8797900676727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107662, + "balance_loss_mlp": 1.07416463, + "epoch": 0.20719507502885726, + "flos": 829456409088.0, + "grad_norm": 0.06176881640488279, + "language_loss": 0.84210765, + "learning_rate": 0.0009199056105416825, + "loss": 0.85318428, + "num_input_tokens_seen": 89280464, + "router_z_loss_mlp": 0.33520508, + "step": 1077, + "time_per_iteration": 3.120950698852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107166, + "balance_loss_mlp": 1.07312012, + "epoch": 0.20738745671412082, + "flos": 637993774080.0, + "grad_norm": 0.055893649084458805, + "language_loss": 0.86594802, + "learning_rate": 0.0009197363994062654, + "loss": 0.87701964, + "num_input_tokens_seen": 89353344, + "router_z_loss_mlp": 0.34057617, + "step": 1078, + "time_per_iteration": 2.8197755813598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086727, + "balance_loss_mlp": 1.05480289, + "epoch": 0.20757983839938438, + "flos": 685258845696.0, + "grad_norm": 0.054433441748304986, + "language_loss": 0.84861732, + "learning_rate": 0.0009195670253147262, + "loss": 0.85948461, + "num_input_tokens_seen": 89439328, + "router_z_loss_mlp": 0.3190918, + "step": 1079, + "time_per_iteration": 2.966987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096354, + "balance_loss_mlp": 1.06340468, + "epoch": 0.20777222008464794, + "flos": 519024208896.0, + "grad_norm": 0.07868801214896702, + "language_loss": 0.82301188, + "learning_rate": 0.0009193974883328216, + "loss": 0.83397532, + "num_input_tokens_seen": 89510160, + "router_z_loss_mlp": 0.32958984, + "step": 1080, + "time_per_iteration": 2.620704174041748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091209, + "balance_loss_mlp": 1.05725837, + "epoch": 0.2079646017699115, + "flos": 511136732160.0, + "grad_norm": 0.09961486538628272, + "language_loss": 0.87482947, + "learning_rate": 0.0009192277885263718, + "loss": 0.88574153, + "num_input_tokens_seen": 89582960, + "router_z_loss_mlp": 0.33984375, + "step": 1081, + "time_per_iteration": 2.6247479915618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087114, + "balance_loss_mlp": 1.05352044, + "epoch": 0.20815698345517505, + "flos": 931409929728.0, + "grad_norm": 0.05448561879445608, + "language_loss": 0.86255312, + "learning_rate": 0.0009190579259612602, + "loss": 0.87342417, + "num_input_tokens_seen": 89675488, + "router_z_loss_mlp": 0.33618164, + "step": 1082, + "time_per_iteration": 3.2661428451538086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085916, + "balance_loss_mlp": 1.05187023, + "epoch": 0.20834936514043864, + "flos": 632114205696.0, + "grad_norm": 0.059638645169798186, + "language_loss": 0.86669636, + "learning_rate": 0.000918887900703433, + "loss": 0.87755549, + "num_input_tokens_seen": 89747872, + "router_z_loss_mlp": 0.34082031, + "step": 1083, + "time_per_iteration": 2.7930080890655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083976, + "balance_loss_mlp": 1.05038285, + "epoch": 0.2085417468257022, + "flos": 394170693120.0, + "grad_norm": 0.06326041775418027, + "language_loss": 0.90427065, + "learning_rate": 0.0009187177128188999, + "loss": 0.91511047, + "num_input_tokens_seen": 89810176, + "router_z_loss_mlp": 0.33618164, + "step": 1084, + "time_per_iteration": 2.431358814239502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054848, + "balance_loss_mlp": 1.04159176, + "epoch": 0.20873412851096576, + "flos": 1401387969024.0, + "grad_norm": 0.04127554175786628, + "language_loss": 0.77156538, + "learning_rate": 0.0009185473623737339, + "loss": 0.78211385, + "num_input_tokens_seen": 90038432, + "router_z_loss_mlp": 0.1328125, + "step": 1085, + "time_per_iteration": 6.352816343307495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080772, + "balance_loss_mlp": 1.04686832, + "epoch": 0.20892651019622932, + "flos": 447599112192.0, + "grad_norm": 0.06234370040412467, + "language_loss": 0.8612783, + "learning_rate": 0.000918376849434071, + "loss": 0.87208605, + "num_input_tokens_seen": 90101568, + "router_z_loss_mlp": 0.33935547, + "step": 1086, + "time_per_iteration": 2.5168843269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084999, + "balance_loss_mlp": 1.05040443, + "epoch": 0.20911889188149288, + "flos": 492863629824.0, + "grad_norm": 0.07820142019527274, + "language_loss": 0.90828383, + "learning_rate": 0.0009182061740661098, + "loss": 0.91913384, + "num_input_tokens_seen": 90169344, + "router_z_loss_mlp": 0.34643555, + "step": 1087, + "time_per_iteration": 2.5461525917053223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083648, + "balance_loss_mlp": 1.0494113, + "epoch": 0.20931127356675644, + "flos": 840928946688.0, + "grad_norm": 0.05821627614551514, + "language_loss": 0.85034752, + "learning_rate": 0.0009180353363361127, + "loss": 0.86118406, + "num_input_tokens_seen": 90252416, + "router_z_loss_mlp": 0.3425293, + "step": 1088, + "time_per_iteration": 3.11942982673645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084605, + "balance_loss_mlp": 1.05036855, + "epoch": 0.20950365525202, + "flos": 756796013568.0, + "grad_norm": 0.06471498550944753, + "language_loss": 0.82101512, + "learning_rate": 0.0009178643363104044, + "loss": 0.83186114, + "num_input_tokens_seen": 90337952, + "router_z_loss_mlp": 0.34277344, + "step": 1089, + "time_per_iteration": 3.0986390113830566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107905, + "balance_loss_mlp": 1.04543328, + "epoch": 0.20969603693728356, + "flos": 472301812224.0, + "grad_norm": 0.07091461504319575, + "language_loss": 0.91050649, + "learning_rate": 0.0009176931740553735, + "loss": 0.92129695, + "num_input_tokens_seen": 90401488, + "router_z_loss_mlp": 0.33642578, + "step": 1090, + "time_per_iteration": 2.4965460300445557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108219, + "balance_loss_mlp": 1.04845381, + "epoch": 0.20988841862254715, + "flos": 976507121664.0, + "grad_norm": 0.05967441428812083, + "language_loss": 0.82829833, + "learning_rate": 0.0009175218496374708, + "loss": 0.83912027, + "num_input_tokens_seen": 90486144, + "router_z_loss_mlp": 0.33740234, + "step": 1091, + "time_per_iteration": 3.325467348098755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082454, + "balance_loss_mlp": 1.04917121, + "epoch": 0.2100808003078107, + "flos": 1092697731072.0, + "grad_norm": 0.06552872916111846, + "language_loss": 0.85816884, + "learning_rate": 0.0009173503631232103, + "loss": 0.86899334, + "num_input_tokens_seen": 90571504, + "router_z_loss_mlp": 0.33300781, + "step": 1092, + "time_per_iteration": 3.3492543697357178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080972, + "balance_loss_mlp": 1.04761767, + "epoch": 0.21027318199307427, + "flos": 1012567468032.0, + "grad_norm": 0.06864870254184631, + "language_loss": 0.8205356, + "learning_rate": 0.0009171787145791691, + "loss": 0.83134532, + "num_input_tokens_seen": 90646016, + "router_z_loss_mlp": 0.33374023, + "step": 1093, + "time_per_iteration": 3.229302167892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083111, + "balance_loss_mlp": 1.04975629, + "epoch": 0.21046556367833782, + "flos": 521141216256.0, + "grad_norm": 0.08362122797221107, + "language_loss": 0.80208671, + "learning_rate": 0.000917006904071987, + "loss": 0.81291783, + "num_input_tokens_seen": 90713440, + "router_z_loss_mlp": 0.33374023, + "step": 1094, + "time_per_iteration": 2.5901217460632324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091484, + "balance_loss_mlp": 1.05843902, + "epoch": 0.21065794536360138, + "flos": 603437948928.0, + "grad_norm": 0.05523679641811596, + "language_loss": 0.87209588, + "learning_rate": 0.0009168349316683669, + "loss": 0.88301063, + "num_input_tokens_seen": 90788208, + "router_z_loss_mlp": 0.33056641, + "step": 1095, + "time_per_iteration": 2.67250919342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093141, + "balance_loss_mlp": 1.06081104, + "epoch": 0.21085032704886494, + "flos": 603045070848.0, + "grad_norm": 0.05347318487829757, + "language_loss": 0.82685143, + "learning_rate": 0.0009166627974350741, + "loss": 0.83778286, + "num_input_tokens_seen": 90873776, + "router_z_loss_mlp": 0.32324219, + "step": 1096, + "time_per_iteration": 2.9291882514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097603, + "balance_loss_mlp": 1.06362867, + "epoch": 0.2110427087341285, + "flos": 637382697984.0, + "grad_norm": 0.059512513015867, + "language_loss": 0.89716321, + "learning_rate": 0.0009164905014389373, + "loss": 0.90813923, + "num_input_tokens_seen": 90945872, + "router_z_loss_mlp": 0.34008789, + "step": 1097, + "time_per_iteration": 2.7384700775146484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100621, + "balance_loss_mlp": 1.06798196, + "epoch": 0.21123509041939206, + "flos": 522667496448.0, + "grad_norm": 0.08051519151754843, + "language_loss": 0.87020361, + "learning_rate": 0.0009163180437468476, + "loss": 0.88120985, + "num_input_tokens_seen": 91016224, + "router_z_loss_mlp": 0.32641602, + "step": 1098, + "time_per_iteration": 2.584890365600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109632, + "balance_loss_mlp": 1.0635848, + "epoch": 0.21142747210465565, + "flos": 450938271744.0, + "grad_norm": 0.05811835985780437, + "language_loss": 0.86184567, + "learning_rate": 0.000916145424425759, + "loss": 0.87280893, + "num_input_tokens_seen": 91086752, + "router_z_loss_mlp": 0.32739258, + "step": 1099, + "time_per_iteration": 2.6362791061401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.07059634, + "epoch": 0.2116198537899192, + "flos": 875813630976.0, + "grad_norm": 0.07623729144092387, + "language_loss": 0.9082064, + "learning_rate": 0.0009159726435426885, + "loss": 0.91924655, + "num_input_tokens_seen": 91162960, + "router_z_loss_mlp": 0.33447266, + "step": 1100, + "time_per_iteration": 3.0668158531188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108924, + "balance_loss_mlp": 1.0554564, + "epoch": 0.21181223547518277, + "flos": 523410992640.0, + "grad_norm": 0.06029059005678133, + "language_loss": 0.90809137, + "learning_rate": 0.0009157997011647154, + "loss": 0.91898382, + "num_input_tokens_seen": 91229840, + "router_z_loss_mlp": 0.33813477, + "step": 1101, + "time_per_iteration": 2.5932393074035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110663, + "balance_loss_mlp": 1.07327497, + "epoch": 0.21200461716044633, + "flos": 572014669824.0, + "grad_norm": 0.05812758027328986, + "language_loss": 0.86378956, + "learning_rate": 0.0009156265973589817, + "loss": 0.87485588, + "num_input_tokens_seen": 91307936, + "router_z_loss_mlp": 0.33374023, + "step": 1102, + "time_per_iteration": 2.79496431350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110266, + "balance_loss_mlp": 1.07672012, + "epoch": 0.2121969988457099, + "flos": 544869075456.0, + "grad_norm": 0.0704183859149776, + "language_loss": 0.89789248, + "learning_rate": 0.0009154533321926926, + "loss": 0.90899515, + "num_input_tokens_seen": 91372848, + "router_z_loss_mlp": 0.33569336, + "step": 1103, + "time_per_iteration": 2.5982048511505127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107149, + "balance_loss_mlp": 1.07393694, + "epoch": 0.21238938053097345, + "flos": 843489704448.0, + "grad_norm": 0.06399101868010165, + "language_loss": 0.87705767, + "learning_rate": 0.0009152799057331156, + "loss": 0.88812917, + "num_input_tokens_seen": 91452768, + "router_z_loss_mlp": 0.33227539, + "step": 1104, + "time_per_iteration": 3.088672637939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100871, + "balance_loss_mlp": 1.06768262, + "epoch": 0.212581762216237, + "flos": 445984081920.0, + "grad_norm": 0.064105004549741, + "language_loss": 0.91047186, + "learning_rate": 0.0009151063180475805, + "loss": 0.9214806, + "num_input_tokens_seen": 91519888, + "router_z_loss_mlp": 0.33203125, + "step": 1105, + "time_per_iteration": 2.569998025894165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090698, + "balance_loss_mlp": 1.05798697, + "epoch": 0.21277414390150057, + "flos": 514129655808.0, + "grad_norm": 0.05967324045126681, + "language_loss": 0.84732658, + "learning_rate": 0.0009149325692034803, + "loss": 0.85823357, + "num_input_tokens_seen": 91585744, + "router_z_loss_mlp": 0.32714844, + "step": 1106, + "time_per_iteration": 2.6009016036987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149918, + "balance_loss_mlp": 1.13380063, + "epoch": 0.21296652558676413, + "flos": 1484790552576.0, + "grad_norm": 0.04210654195905191, + "language_loss": 0.79203427, + "learning_rate": 0.0009147586592682702, + "loss": 0.80353343, + "num_input_tokens_seen": 91805840, + "router_z_loss_mlp": 0.16113281, + "step": 1107, + "time_per_iteration": 4.820629596710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082056, + "balance_loss_mlp": 1.04994082, + "epoch": 0.21315890727202771, + "flos": 845689669632.0, + "grad_norm": 0.07454945953507684, + "language_loss": 0.87513995, + "learning_rate": 0.0009145845883094678, + "loss": 0.88596046, + "num_input_tokens_seen": 91885936, + "router_z_loss_mlp": 0.32080078, + "step": 1108, + "time_per_iteration": 3.0311591625213623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083334, + "balance_loss_mlp": 1.04971695, + "epoch": 0.21335128895729127, + "flos": 629086376448.0, + "grad_norm": 0.07212897946446892, + "language_loss": 0.85387337, + "learning_rate": 0.000914410356394654, + "loss": 0.86470675, + "num_input_tokens_seen": 91959888, + "router_z_loss_mlp": 0.33642578, + "step": 1109, + "time_per_iteration": 2.7746968269348145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085071, + "balance_loss_mlp": 1.05102468, + "epoch": 0.21354367064255483, + "flos": 710649787392.0, + "grad_norm": 0.053148069764317206, + "language_loss": 0.85104829, + "learning_rate": 0.0009142359635914709, + "loss": 0.86189902, + "num_input_tokens_seen": 92043728, + "router_z_loss_mlp": 0.34057617, + "step": 1110, + "time_per_iteration": 3.109018564224243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083884, + "balance_loss_mlp": 1.05067194, + "epoch": 0.2137360523278184, + "flos": 455950688256.0, + "grad_norm": 0.07113647076789116, + "language_loss": 0.84692943, + "learning_rate": 0.0009140614099676245, + "loss": 0.8577683, + "num_input_tokens_seen": 92114096, + "router_z_loss_mlp": 0.33203125, + "step": 1111, + "time_per_iteration": 2.5607409477233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083673, + "balance_loss_mlp": 1.05072355, + "epoch": 0.21392843401308195, + "flos": 665749034496.0, + "grad_norm": 0.059219994241997045, + "language_loss": 0.82997137, + "learning_rate": 0.0009138866955908821, + "loss": 0.84080815, + "num_input_tokens_seen": 92193552, + "router_z_loss_mlp": 0.32958984, + "step": 1112, + "time_per_iteration": 2.901376724243164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086583, + "balance_loss_mlp": 1.05327559, + "epoch": 0.2141208156983455, + "flos": 748656843264.0, + "grad_norm": 0.06302145936378449, + "language_loss": 0.80617785, + "learning_rate": 0.0009137118205290738, + "loss": 0.81704366, + "num_input_tokens_seen": 92279248, + "router_z_loss_mlp": 0.33325195, + "step": 1113, + "time_per_iteration": 2.9629132747650146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085777, + "balance_loss_mlp": 1.05142069, + "epoch": 0.21431319738360907, + "flos": 418898124288.0, + "grad_norm": 0.06913372638273338, + "language_loss": 0.90778732, + "learning_rate": 0.0009135367848500924, + "loss": 0.91864502, + "num_input_tokens_seen": 92344064, + "router_z_loss_mlp": 0.34399414, + "step": 1114, + "time_per_iteration": 2.5860419273376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087087, + "balance_loss_mlp": 1.05406582, + "epoch": 0.21450557906887263, + "flos": 608849035776.0, + "grad_norm": 0.07370492115341919, + "language_loss": 0.86567605, + "learning_rate": 0.0009133615886218927, + "loss": 0.87654686, + "num_input_tokens_seen": 92410544, + "router_z_loss_mlp": 0.33032227, + "step": 1115, + "time_per_iteration": 2.6986265182495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095845, + "balance_loss_mlp": 1.06082106, + "epoch": 0.21469796075413622, + "flos": 561649393152.0, + "grad_norm": 0.0682239504380638, + "language_loss": 0.88444531, + "learning_rate": 0.0009131862319124917, + "loss": 0.89540386, + "num_input_tokens_seen": 92480272, + "router_z_loss_mlp": 0.3503418, + "step": 1116, + "time_per_iteration": 2.644977569580078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086331, + "balance_loss_mlp": 1.0540725, + "epoch": 0.21489034243939978, + "flos": 594363225600.0, + "grad_norm": 0.06937847326766512, + "language_loss": 0.8429122, + "learning_rate": 0.0009130107147899691, + "loss": 0.85377544, + "num_input_tokens_seen": 92555584, + "router_z_loss_mlp": 0.32250977, + "step": 1117, + "time_per_iteration": 2.768064498901367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084886, + "balance_loss_mlp": 1.05148315, + "epoch": 0.21508272412466334, + "flos": 441661317120.0, + "grad_norm": 0.09911577685113587, + "language_loss": 0.85504615, + "learning_rate": 0.0009128350373224665, + "loss": 0.86589503, + "num_input_tokens_seen": 92623136, + "router_z_loss_mlp": 0.33422852, + "step": 1118, + "time_per_iteration": 2.5369865894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_mlp": 1.02206326, + "epoch": 0.2152751058099269, + "flos": 1495397348352.0, + "grad_norm": 0.028624916140058014, + "language_loss": 0.81456429, + "learning_rate": 0.0009126591995781883, + "loss": 0.82490271, + "num_input_tokens_seen": 92842608, + "router_z_loss_mlp": 0.11767578, + "step": 1119, + "time_per_iteration": 4.634536266326904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109091, + "balance_loss_mlp": 1.05741262, + "epoch": 0.21546748749519046, + "flos": 493759895040.0, + "grad_norm": 0.057336284262766976, + "language_loss": 0.85470641, + "learning_rate": 0.0009124832016254005, + "loss": 0.86561549, + "num_input_tokens_seen": 92912960, + "router_z_loss_mlp": 0.33520508, + "step": 1120, + "time_per_iteration": 2.57099986076355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097048, + "balance_loss_mlp": 1.06245303, + "epoch": 0.21565986918045402, + "flos": 634241387520.0, + "grad_norm": 0.0556622286599547, + "language_loss": 0.8842063, + "learning_rate": 0.0009123070435324316, + "loss": 0.89517677, + "num_input_tokens_seen": 92982272, + "router_z_loss_mlp": 0.34619141, + "step": 1121, + "time_per_iteration": 2.73698091506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010392, + "balance_loss_mlp": 1.02780366, + "epoch": 0.21585225086571758, + "flos": 1582502704128.0, + "grad_norm": 0.024824935431588098, + "language_loss": 0.77875781, + "learning_rate": 0.0009121307253676722, + "loss": 0.78914982, + "num_input_tokens_seen": 93218752, + "router_z_loss_mlp": 0.11376953, + "step": 1122, + "time_per_iteration": 4.960963010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093163, + "balance_loss_mlp": 1.05897415, + "epoch": 0.21604463255098114, + "flos": 683799556608.0, + "grad_norm": 0.06637115500638362, + "language_loss": 0.86772728, + "learning_rate": 0.0009119542471995752, + "loss": 0.87865889, + "num_input_tokens_seen": 93293968, + "router_z_loss_mlp": 0.34204102, + "step": 1123, + "time_per_iteration": 2.819042205810547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109659, + "balance_loss_mlp": 1.06228209, + "epoch": 0.2162370142362447, + "flos": 780660675072.0, + "grad_norm": 0.06221084946299637, + "language_loss": 0.81623554, + "learning_rate": 0.0009117776090966554, + "loss": 0.82720149, + "num_input_tokens_seen": 93367088, + "router_z_loss_mlp": 0.34326172, + "step": 1124, + "time_per_iteration": 2.9435975551605225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090546, + "balance_loss_mlp": 1.05473578, + "epoch": 0.21642939592150828, + "flos": 1001745294336.0, + "grad_norm": 0.06219513600405685, + "language_loss": 0.86821365, + "learning_rate": 0.0009116008111274899, + "loss": 0.8791191, + "num_input_tokens_seen": 93452944, + "router_z_loss_mlp": 0.35839844, + "step": 1125, + "time_per_iteration": 3.250828504562378 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01023544, + "balance_loss_mlp": 1.01271951, + "epoch": 0.21662177760677184, + "flos": 1481867440128.0, + "grad_norm": 0.013492425774453086, + "language_loss": 0.79106927, + "learning_rate": 0.0009114238533607176, + "loss": 0.8013047, + "num_input_tokens_seen": 93677328, + "router_z_loss_mlp": 0.10839844, + "step": 1126, + "time_per_iteration": 4.836662530899048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078043, + "balance_loss_mlp": 1.0431627, + "epoch": 0.2168141592920354, + "flos": 887030092800.0, + "grad_norm": 0.06408405180788145, + "language_loss": 0.84878719, + "learning_rate": 0.0009112467358650396, + "loss": 0.85956764, + "num_input_tokens_seen": 93756848, + "router_z_loss_mlp": 0.34912109, + "step": 1127, + "time_per_iteration": 3.118460178375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087154, + "balance_loss_mlp": 1.05205846, + "epoch": 0.21700654097729896, + "flos": 545682382848.0, + "grad_norm": 0.06014422622436645, + "language_loss": 0.86521864, + "learning_rate": 0.0009110694587092192, + "loss": 0.87609017, + "num_input_tokens_seen": 93834704, + "router_z_loss_mlp": 0.35131836, + "step": 1128, + "time_per_iteration": 2.736814022064209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080165, + "balance_loss_mlp": 1.0446167, + "epoch": 0.21719892266256252, + "flos": 509270008320.0, + "grad_norm": 0.06606219668196793, + "language_loss": 0.81429344, + "learning_rate": 0.0009108920219620815, + "loss": 0.82509506, + "num_input_tokens_seen": 93904448, + "router_z_loss_mlp": 0.35571289, + "step": 1129, + "time_per_iteration": 2.6214489936828613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083138, + "balance_loss_mlp": 1.04782772, + "epoch": 0.21739130434782608, + "flos": 543150738432.0, + "grad_norm": 0.060577581075914995, + "language_loss": 0.89903116, + "learning_rate": 0.0009107144256925133, + "loss": 0.90986252, + "num_input_tokens_seen": 93979312, + "router_z_loss_mlp": 0.35302734, + "step": 1130, + "time_per_iteration": 2.6337971687316895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079633, + "balance_loss_mlp": 1.04489541, + "epoch": 0.21758368603308964, + "flos": 616564804608.0, + "grad_norm": 0.0674499307688184, + "language_loss": 0.82610142, + "learning_rate": 0.0009105366699694638, + "loss": 0.83689773, + "num_input_tokens_seen": 94052032, + "router_z_loss_mlp": 0.34790039, + "step": 1131, + "time_per_iteration": 2.6984267234802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085371, + "balance_loss_mlp": 1.04979873, + "epoch": 0.2177760677183532, + "flos": 634813175808.0, + "grad_norm": 0.051829013054278075, + "language_loss": 0.8159321, + "learning_rate": 0.0009103587548619439, + "loss": 0.8267858, + "num_input_tokens_seen": 94124944, + "router_z_loss_mlp": 0.35571289, + "step": 1132, + "time_per_iteration": 2.8308732509613037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083319, + "balance_loss_mlp": 1.04850996, + "epoch": 0.2179684494036168, + "flos": 532181587968.0, + "grad_norm": 0.06772780520844247, + "language_loss": 0.86115086, + "learning_rate": 0.0009101806804390261, + "loss": 0.87198412, + "num_input_tokens_seen": 94200384, + "router_z_loss_mlp": 0.34863281, + "step": 1133, + "time_per_iteration": 2.7745282649993896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081903, + "balance_loss_mlp": 1.04671264, + "epoch": 0.21816083108888035, + "flos": 474980433408.0, + "grad_norm": 0.05911376481567057, + "language_loss": 0.90451765, + "learning_rate": 0.0009100024467698453, + "loss": 0.91533667, + "num_input_tokens_seen": 94266992, + "router_z_loss_mlp": 0.35205078, + "step": 1134, + "time_per_iteration": 2.551278829574585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086865, + "balance_loss_mlp": 1.051054, + "epoch": 0.2183532127741439, + "flos": 577198794240.0, + "grad_norm": 0.07962415284192025, + "language_loss": 0.83050048, + "learning_rate": 0.0009098240539235981, + "loss": 0.84136909, + "num_input_tokens_seen": 94334304, + "router_z_loss_mlp": 0.3581543, + "step": 1135, + "time_per_iteration": 2.660019636154175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086508, + "balance_loss_mlp": 1.05172312, + "epoch": 0.21854559445940747, + "flos": 593832135168.0, + "grad_norm": 0.05867668726509775, + "language_loss": 0.87679315, + "learning_rate": 0.0009096455019695423, + "loss": 0.88765824, + "num_input_tokens_seen": 94413296, + "router_z_loss_mlp": 0.34838867, + "step": 1136, + "time_per_iteration": 2.756463050842285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090029, + "balance_loss_mlp": 1.05426645, + "epoch": 0.21873797614467103, + "flos": 408464446464.0, + "grad_norm": 0.06290439978907646, + "language_loss": 0.90092266, + "learning_rate": 0.000909466790976998, + "loss": 0.91182297, + "num_input_tokens_seen": 94475840, + "router_z_loss_mlp": 0.35791016, + "step": 1137, + "time_per_iteration": 2.5046186447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089401, + "balance_loss_mlp": 1.05373359, + "epoch": 0.21893035782993459, + "flos": 893824865280.0, + "grad_norm": 0.05253297698454947, + "language_loss": 0.83030021, + "learning_rate": 0.0009092879210153473, + "loss": 0.84119421, + "num_input_tokens_seen": 94555184, + "router_z_loss_mlp": 0.35668945, + "step": 1138, + "time_per_iteration": 3.1294023990631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092735, + "balance_loss_mlp": 1.05835557, + "epoch": 0.21912273951519814, + "flos": 467392702464.0, + "grad_norm": 0.05516730570048504, + "language_loss": 0.88930631, + "learning_rate": 0.0009091088921540333, + "loss": 0.90023363, + "num_input_tokens_seen": 94622656, + "router_z_loss_mlp": 0.34399414, + "step": 1139, + "time_per_iteration": 2.5161380767822266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081896, + "balance_loss_mlp": 1.06921172, + "epoch": 0.2193151212004617, + "flos": 1531262665728.0, + "grad_norm": 0.036356034107047845, + "language_loss": 0.75508678, + "learning_rate": 0.0009089297044625615, + "loss": 0.76590574, + "num_input_tokens_seen": 94856496, + "router_z_loss_mlp": 0.12695312, + "step": 1140, + "time_per_iteration": 4.929131984710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087847, + "balance_loss_mlp": 1.05306172, + "epoch": 0.2195075028857253, + "flos": 590901820416.0, + "grad_norm": 0.07364984820319191, + "language_loss": 0.8488574, + "learning_rate": 0.0009087503580104985, + "loss": 0.85973585, + "num_input_tokens_seen": 94926880, + "router_z_loss_mlp": 0.34814453, + "step": 1141, + "time_per_iteration": 2.676321029663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087752, + "balance_loss_mlp": 1.05287158, + "epoch": 0.21969988457098885, + "flos": 636033917952.0, + "grad_norm": 0.0662048159418312, + "language_loss": 0.79610777, + "learning_rate": 0.0009085708528674728, + "loss": 0.80698538, + "num_input_tokens_seen": 95000528, + "router_z_loss_mlp": 0.34912109, + "step": 1142, + "time_per_iteration": 2.7667393684387207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088746, + "balance_loss_mlp": 1.05202913, + "epoch": 0.2198922662562524, + "flos": 911974311936.0, + "grad_norm": 0.07907290305355467, + "language_loss": 0.86086833, + "learning_rate": 0.0009083911891031745, + "loss": 0.87175578, + "num_input_tokens_seen": 95081040, + "router_z_loss_mlp": 0.36743164, + "step": 1143, + "time_per_iteration": 3.1026079654693604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093334, + "balance_loss_mlp": 1.05809617, + "epoch": 0.22008464794151597, + "flos": 822603409920.0, + "grad_norm": 0.06284406217527433, + "language_loss": 0.91362917, + "learning_rate": 0.0009082113667873553, + "loss": 0.92456251, + "num_input_tokens_seen": 95167328, + "router_z_loss_mlp": 0.3527832, + "step": 1144, + "time_per_iteration": 3.098741292953491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107482, + "balance_loss_mlp": 1.07188582, + "epoch": 0.22027702962677953, + "flos": 459416475648.0, + "grad_norm": 0.06625631151069579, + "language_loss": 0.90562177, + "learning_rate": 0.0009080313859898283, + "loss": 0.91669661, + "num_input_tokens_seen": 95230304, + "router_z_loss_mlp": 0.35620117, + "step": 1145, + "time_per_iteration": 2.4998207092285156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111259, + "balance_loss_mlp": 1.07606888, + "epoch": 0.2204694113120431, + "flos": 530998723584.0, + "grad_norm": 0.05051092763003013, + "language_loss": 0.91815794, + "learning_rate": 0.0009078512467804684, + "loss": 0.92927051, + "num_input_tokens_seen": 95299520, + "router_z_loss_mlp": 0.35180664, + "step": 1146, + "time_per_iteration": 2.569073438644409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117194, + "balance_loss_mlp": 1.08162224, + "epoch": 0.22066179299730665, + "flos": 522382307328.0, + "grad_norm": 0.06837547739928014, + "language_loss": 0.90610331, + "learning_rate": 0.0009076709492292119, + "loss": 0.91727525, + "num_input_tokens_seen": 95368912, + "router_z_loss_mlp": 0.35571289, + "step": 1147, + "time_per_iteration": 2.614039659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114459, + "balance_loss_mlp": 1.07969809, + "epoch": 0.2208541746825702, + "flos": 546188742144.0, + "grad_norm": 0.06837160959472317, + "language_loss": 0.89193797, + "learning_rate": 0.0009074904934060562, + "loss": 0.90308249, + "num_input_tokens_seen": 95440800, + "router_z_loss_mlp": 0.34790039, + "step": 1148, + "time_per_iteration": 2.6419012546539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112121, + "balance_loss_mlp": 1.08578134, + "epoch": 0.22104655636783377, + "flos": 708404742144.0, + "grad_norm": 0.07108081727062696, + "language_loss": 0.84988266, + "learning_rate": 0.0009073098793810607, + "loss": 0.86109483, + "num_input_tokens_seen": 95519904, + "router_z_loss_mlp": 0.35473633, + "step": 1149, + "time_per_iteration": 2.909515142440796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116065, + "balance_loss_mlp": 1.08061242, + "epoch": 0.22123893805309736, + "flos": 584594468352.0, + "grad_norm": 0.07695680382665727, + "language_loss": 0.88374794, + "learning_rate": 0.000907129107224346, + "loss": 0.89490861, + "num_input_tokens_seen": 95591568, + "router_z_loss_mlp": 0.35522461, + "step": 1150, + "time_per_iteration": 2.7029008865356445 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099143, + "balance_loss_mlp": 1.06369042, + "epoch": 0.22143131973836092, + "flos": 492002270208.0, + "grad_norm": 0.049049579749502144, + "language_loss": 0.88305712, + "learning_rate": 0.0009069481770060939, + "loss": 0.89404863, + "num_input_tokens_seen": 95664480, + "router_z_loss_mlp": 0.35498047, + "step": 1151, + "time_per_iteration": 2.65167236328125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097606, + "balance_loss_mlp": 1.06248736, + "epoch": 0.22162370142362448, + "flos": 1079227459584.0, + "grad_norm": 0.054063738490033035, + "language_loss": 0.84240663, + "learning_rate": 0.000906767088796548, + "loss": 0.85338271, + "num_input_tokens_seen": 95754400, + "router_z_loss_mlp": 0.35180664, + "step": 1152, + "time_per_iteration": 3.423985004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110736, + "balance_loss_mlp": 1.07300401, + "epoch": 0.22181608310888803, + "flos": 492258345984.0, + "grad_norm": 0.057939830998464815, + "language_loss": 0.87012136, + "learning_rate": 0.0009065858426660127, + "loss": 0.88119501, + "num_input_tokens_seen": 95826944, + "router_z_loss_mlp": 0.34399414, + "step": 1153, + "time_per_iteration": 2.5987319946289062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108178, + "balance_loss_mlp": 1.07355952, + "epoch": 0.2220084647941516, + "flos": 723687892992.0, + "grad_norm": 0.0653796708212952, + "language_loss": 0.84926325, + "learning_rate": 0.0009064044386848543, + "loss": 0.86034507, + "num_input_tokens_seen": 95902688, + "router_z_loss_mlp": 0.34667969, + "step": 1154, + "time_per_iteration": 2.9024224281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112372, + "balance_loss_mlp": 1.0753932, + "epoch": 0.22220084647941515, + "flos": 488988997632.0, + "grad_norm": 0.06606878403176955, + "language_loss": 0.88905716, + "learning_rate": 0.0009062228769234997, + "loss": 0.90018088, + "num_input_tokens_seen": 95969952, + "router_z_loss_mlp": 0.36987305, + "step": 1155, + "time_per_iteration": 2.5483920574188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104305, + "balance_loss_mlp": 1.06868565, + "epoch": 0.2223932281646787, + "flos": 536025696768.0, + "grad_norm": 0.06185680569649912, + "language_loss": 0.81360811, + "learning_rate": 0.0009060411574524376, + "loss": 0.82465118, + "num_input_tokens_seen": 96037344, + "router_z_loss_mlp": 0.35644531, + "step": 1156, + "time_per_iteration": 2.629166841506958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114341, + "balance_loss_mlp": 1.0794363, + "epoch": 0.22258560984994227, + "flos": 931034580480.0, + "grad_norm": 0.06288530021121215, + "language_loss": 0.88191485, + "learning_rate": 0.0009058592803422178, + "loss": 0.8930583, + "num_input_tokens_seen": 96115616, + "router_z_loss_mlp": 0.34936523, + "step": 1157, + "time_per_iteration": 3.133453845977783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01219449, + "balance_loss_mlp": 1.20495331, + "epoch": 0.22277799153520586, + "flos": 1198998443520.0, + "grad_norm": 0.06392494715081258, + "language_loss": 0.78710288, + "learning_rate": 0.0009056772456634512, + "loss": 0.79929739, + "num_input_tokens_seen": 96333600, + "router_z_loss_mlp": 0.14453125, + "step": 1158, + "time_per_iteration": 4.838433027267456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095231, + "balance_loss_mlp": 1.0620904, + "epoch": 0.22297037322046942, + "flos": 501052262400.0, + "grad_norm": 0.059439708082357066, + "language_loss": 0.90095651, + "learning_rate": 0.00090549505348681, + "loss": 0.91190875, + "num_input_tokens_seen": 96402544, + "router_z_loss_mlp": 0.33154297, + "step": 1159, + "time_per_iteration": 2.561887264251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083603, + "balance_loss_mlp": 1.04977143, + "epoch": 0.22316275490573298, + "flos": 752413612032.0, + "grad_norm": 0.05610915875378834, + "language_loss": 0.84254742, + "learning_rate": 0.0009053127038830275, + "loss": 0.85338354, + "num_input_tokens_seen": 96487600, + "router_z_loss_mlp": 0.33862305, + "step": 1160, + "time_per_iteration": 2.9465925693511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082896, + "balance_loss_mlp": 1.04844403, + "epoch": 0.22335513659099654, + "flos": 514553057280.0, + "grad_norm": 0.06657410760601727, + "language_loss": 0.87182009, + "learning_rate": 0.000905130196922898, + "loss": 0.88264906, + "num_input_tokens_seen": 96554912, + "router_z_loss_mlp": 0.3449707, + "step": 1161, + "time_per_iteration": 2.597325325012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076013, + "balance_loss_mlp": 1.04213357, + "epoch": 0.2235475182762601, + "flos": 484286501376.0, + "grad_norm": 0.057467913173926514, + "language_loss": 0.87228084, + "learning_rate": 0.0009049475326772769, + "loss": 0.88304096, + "num_input_tokens_seen": 96624192, + "router_z_loss_mlp": 0.33911133, + "step": 1162, + "time_per_iteration": 2.591592788696289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107232, + "balance_loss_mlp": 1.0379163, + "epoch": 0.22373989996152366, + "flos": 469698794496.0, + "grad_norm": 0.05481831884816676, + "language_loss": 0.83362567, + "learning_rate": 0.0009047647112170811, + "loss": 0.84434885, + "num_input_tokens_seen": 96701040, + "router_z_loss_mlp": 0.34448242, + "step": 1163, + "time_per_iteration": 2.7466936111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080512, + "balance_loss_mlp": 1.04503536, + "epoch": 0.22393228164678722, + "flos": 1270512594432.0, + "grad_norm": 0.0775991801606853, + "language_loss": 0.87402856, + "learning_rate": 0.0009045817326132876, + "loss": 0.88483369, + "num_input_tokens_seen": 96791200, + "router_z_loss_mlp": 0.35498047, + "step": 1164, + "time_per_iteration": 3.6615524291992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082563, + "balance_loss_mlp": 1.04665732, + "epoch": 0.22412466333205078, + "flos": 596052449280.0, + "grad_norm": 0.05603114612800397, + "language_loss": 0.83484542, + "learning_rate": 0.0009043985969369357, + "loss": 0.84567106, + "num_input_tokens_seen": 96869360, + "router_z_loss_mlp": 0.35913086, + "step": 1165, + "time_per_iteration": 2.800389528274536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084793, + "balance_loss_mlp": 1.047647, + "epoch": 0.22431704501731436, + "flos": 608136062976.0, + "grad_norm": 0.052919924442321326, + "language_loss": 0.84423298, + "learning_rate": 0.0009042153042591245, + "loss": 0.85508084, + "num_input_tokens_seen": 96945840, + "router_z_loss_mlp": 0.37158203, + "step": 1166, + "time_per_iteration": 2.7848384380340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080872, + "balance_loss_mlp": 1.04622972, + "epoch": 0.22450942670257792, + "flos": 906203842560.0, + "grad_norm": 0.054053491984114646, + "language_loss": 0.85318398, + "learning_rate": 0.0009040318546510146, + "loss": 0.86399269, + "num_input_tokens_seen": 97029296, + "router_z_loss_mlp": 0.34667969, + "step": 1167, + "time_per_iteration": 3.1406538486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080871, + "balance_loss_mlp": 1.04529881, + "epoch": 0.22470180838784148, + "flos": 565032222720.0, + "grad_norm": 0.06590224184570584, + "language_loss": 0.85490131, + "learning_rate": 0.0009038482481838275, + "loss": 0.86571002, + "num_input_tokens_seen": 97097776, + "router_z_loss_mlp": 0.35620117, + "step": 1168, + "time_per_iteration": 2.6623363494873047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107832, + "balance_loss_mlp": 1.04265296, + "epoch": 0.22489419007310504, + "flos": 834109443072.0, + "grad_norm": 0.05295244004415107, + "language_loss": 0.87364161, + "learning_rate": 0.0009036644849288455, + "loss": 0.88442481, + "num_input_tokens_seen": 97181424, + "router_z_loss_mlp": 0.35668945, + "step": 1169, + "time_per_iteration": 3.096397638320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082872, + "balance_loss_mlp": 1.04567838, + "epoch": 0.2250865717583686, + "flos": 580788237312.0, + "grad_norm": 0.06189616009675637, + "language_loss": 0.85257494, + "learning_rate": 0.0009034805649574118, + "loss": 0.86340362, + "num_input_tokens_seen": 97252128, + "router_z_loss_mlp": 0.37207031, + "step": 1170, + "time_per_iteration": 2.655629873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081931, + "balance_loss_mlp": 1.04669285, + "epoch": 0.22527895344363216, + "flos": 600091435008.0, + "grad_norm": 0.0574349936504533, + "language_loss": 0.85081124, + "learning_rate": 0.0009032964883409308, + "loss": 0.86163056, + "num_input_tokens_seen": 97326640, + "router_z_loss_mlp": 0.35253906, + "step": 1171, + "time_per_iteration": 2.9228479862213135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096453, + "balance_loss_mlp": 1.08109915, + "epoch": 0.22547133512889572, + "flos": 1440009073152.0, + "grad_norm": 0.03435009764288223, + "language_loss": 0.73050535, + "learning_rate": 0.000903112255150867, + "loss": 0.74146986, + "num_input_tokens_seen": 97553952, + "router_z_loss_mlp": 0.15332031, + "step": 1172, + "time_per_iteration": 4.9870195388793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099751, + "balance_loss_mlp": 1.06365418, + "epoch": 0.22566371681415928, + "flos": 490377065472.0, + "grad_norm": 0.06750207251725504, + "language_loss": 0.87597418, + "learning_rate": 0.0009029278654587462, + "loss": 0.88697171, + "num_input_tokens_seen": 97623584, + "router_z_loss_mlp": 0.36108398, + "step": 1173, + "time_per_iteration": 2.545078754425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098309, + "balance_loss_mlp": 1.06156898, + "epoch": 0.22585609849942284, + "flos": 604334214144.0, + "grad_norm": 0.06244795934891309, + "language_loss": 0.82517409, + "learning_rate": 0.0009027433193361548, + "loss": 0.8361572, + "num_input_tokens_seen": 97695952, + "router_z_loss_mlp": 0.36767578, + "step": 1174, + "time_per_iteration": 2.69753098487854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100346, + "balance_loss_mlp": 1.06305695, + "epoch": 0.22604848018468643, + "flos": 635280247296.0, + "grad_norm": 0.06854123529633785, + "language_loss": 0.87138826, + "learning_rate": 0.00090255861685474, + "loss": 0.88239175, + "num_input_tokens_seen": 97764544, + "router_z_loss_mlp": 0.37280273, + "step": 1175, + "time_per_iteration": 2.7199149131774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094451, + "balance_loss_mlp": 1.05744886, + "epoch": 0.22624086186995, + "flos": 479633467392.0, + "grad_norm": 0.06836538183258173, + "language_loss": 0.91474092, + "learning_rate": 0.0009023737580862095, + "loss": 0.92568541, + "num_input_tokens_seen": 97830976, + "router_z_loss_mlp": 0.36962891, + "step": 1176, + "time_per_iteration": 2.51255464553833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092025, + "balance_loss_mlp": 1.0570724, + "epoch": 0.22643324355521355, + "flos": 495566982144.0, + "grad_norm": 0.05906016973995859, + "language_loss": 0.83066601, + "learning_rate": 0.0009021887431023321, + "loss": 0.84158623, + "num_input_tokens_seen": 97898800, + "router_z_loss_mlp": 0.34985352, + "step": 1177, + "time_per_iteration": 2.5783960819244385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084144, + "balance_loss_mlp": 1.04842877, + "epoch": 0.2266256252404771, + "flos": 561271071744.0, + "grad_norm": 0.05542928649781209, + "language_loss": 0.87720597, + "learning_rate": 0.0009020035719749369, + "loss": 0.8880474, + "num_input_tokens_seen": 97974112, + "router_z_loss_mlp": 0.35742188, + "step": 1178, + "time_per_iteration": 2.7076900005340576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088743, + "balance_loss_mlp": 1.05259871, + "epoch": 0.22681800692574067, + "flos": 579353527296.0, + "grad_norm": 0.05892405405909356, + "language_loss": 0.77506709, + "learning_rate": 0.0009018182447759136, + "loss": 0.78595448, + "num_input_tokens_seen": 98056640, + "router_z_loss_mlp": 0.36157227, + "step": 1179, + "time_per_iteration": 2.974362373352051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083275, + "balance_loss_mlp": 1.04798961, + "epoch": 0.22701038861100423, + "flos": 739842577920.0, + "grad_norm": 0.0555118465290956, + "language_loss": 0.80168724, + "learning_rate": 0.0009016327615772126, + "loss": 0.81252003, + "num_input_tokens_seen": 98135952, + "router_z_loss_mlp": 0.35327148, + "step": 1180, + "time_per_iteration": 2.9207658767700195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082522, + "balance_loss_mlp": 1.04776096, + "epoch": 0.2272027702962678, + "flos": 576996562944.0, + "grad_norm": 0.06857059729731818, + "language_loss": 0.88146389, + "learning_rate": 0.0009014471224508451, + "loss": 0.8922891, + "num_input_tokens_seen": 98204288, + "router_z_loss_mlp": 0.34790039, + "step": 1181, + "time_per_iteration": 2.6884429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080826, + "balance_loss_mlp": 1.0466603, + "epoch": 0.22739515198153135, + "flos": 544012098048.0, + "grad_norm": 0.07386093909180869, + "language_loss": 0.83020878, + "learning_rate": 0.0009012613274688823, + "loss": 0.84101701, + "num_input_tokens_seen": 98269856, + "router_z_loss_mlp": 0.34204102, + "step": 1182, + "time_per_iteration": 2.625973701477051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082925, + "balance_loss_mlp": 1.04735291, + "epoch": 0.22758753366679493, + "flos": 439932805632.0, + "grad_norm": 0.06621157637783351, + "language_loss": 0.87839937, + "learning_rate": 0.0009010753767034565, + "loss": 0.88922858, + "num_input_tokens_seen": 98335632, + "router_z_loss_mlp": 0.35571289, + "step": 1183, + "time_per_iteration": 2.545569658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086712, + "balance_loss_mlp": 1.05030489, + "epoch": 0.2277799153520585, + "flos": 729104772096.0, + "grad_norm": 0.07242159959797279, + "language_loss": 0.79501748, + "learning_rate": 0.0009008892702267599, + "loss": 0.80588454, + "num_input_tokens_seen": 98420592, + "router_z_loss_mlp": 0.36425781, + "step": 1184, + "time_per_iteration": 2.9862120151519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089174, + "balance_loss_mlp": 1.05322075, + "epoch": 0.22797229703732205, + "flos": 526641053184.0, + "grad_norm": 0.0740207336504876, + "language_loss": 0.89059424, + "learning_rate": 0.0009007030081110457, + "loss": 0.90148592, + "num_input_tokens_seen": 98488096, + "router_z_loss_mlp": 0.35961914, + "step": 1185, + "time_per_iteration": 2.6184284687042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083236, + "balance_loss_mlp": 1.04783034, + "epoch": 0.2281646787225856, + "flos": 535159954944.0, + "grad_norm": 0.06479663876551665, + "language_loss": 0.84969211, + "learning_rate": 0.000900516590428627, + "loss": 0.86052454, + "num_input_tokens_seen": 98561664, + "router_z_loss_mlp": 0.35449219, + "step": 1186, + "time_per_iteration": 2.724161386489868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083897, + "balance_loss_mlp": 1.049088, + "epoch": 0.22835706040784917, + "flos": 541107924480.0, + "grad_norm": 0.052728830082858405, + "language_loss": 0.89177948, + "learning_rate": 0.0009003300172518778, + "loss": 0.90261841, + "num_input_tokens_seen": 98634336, + "router_z_loss_mlp": 0.34790039, + "step": 1187, + "time_per_iteration": 2.6810121536254883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108576, + "balance_loss_mlp": 1.05018783, + "epoch": 0.22854944209311273, + "flos": 790297012224.0, + "grad_norm": 0.05376177869775473, + "language_loss": 0.84676045, + "learning_rate": 0.0009001432886532321, + "loss": 0.85761803, + "num_input_tokens_seen": 98709600, + "router_z_loss_mlp": 0.35571289, + "step": 1188, + "time_per_iteration": 2.977433919906616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109735, + "balance_loss_mlp": 1.0614686, + "epoch": 0.2287418237783763, + "flos": 469047020544.0, + "grad_norm": 0.06589135500726684, + "language_loss": 0.86752445, + "learning_rate": 0.0008999564047051843, + "loss": 0.87849802, + "num_input_tokens_seen": 98775024, + "router_z_loss_mlp": 0.35913086, + "step": 1189, + "time_per_iteration": 2.5126237869262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094774, + "balance_loss_mlp": 1.06017935, + "epoch": 0.22893420546363985, + "flos": 467786990592.0, + "grad_norm": 0.061551577223012334, + "language_loss": 0.84713042, + "learning_rate": 0.0008997693654802894, + "loss": 0.85807812, + "num_input_tokens_seen": 98845248, + "router_z_loss_mlp": 0.34643555, + "step": 1190, + "time_per_iteration": 2.6570322513580322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088276, + "balance_loss_mlp": 1.05318046, + "epoch": 0.22912658714890344, + "flos": 625974179328.0, + "grad_norm": 0.05326512300588333, + "language_loss": 0.86549705, + "learning_rate": 0.0008995821710511625, + "loss": 0.87637979, + "num_input_tokens_seen": 98913584, + "router_z_loss_mlp": 0.35107422, + "step": 1191, + "time_per_iteration": 2.8115806579589844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108036, + "balance_loss_mlp": 1.04731488, + "epoch": 0.229318968834167, + "flos": 502785156096.0, + "grad_norm": 0.06330680163661413, + "language_loss": 0.8511278, + "learning_rate": 0.0008993948214904786, + "loss": 0.86193144, + "num_input_tokens_seen": 98978608, + "router_z_loss_mlp": 0.33056641, + "step": 1192, + "time_per_iteration": 2.546410083770752 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125589, + "balance_loss_mlp": 1.11023474, + "epoch": 0.22951135051943056, + "flos": 1374108544512.0, + "grad_norm": 0.06153086464986019, + "language_loss": 0.78422213, + "learning_rate": 0.0008992073168709733, + "loss": 0.79547799, + "num_input_tokens_seen": 99207424, + "router_z_loss_mlp": 0.15332031, + "step": 1193, + "time_per_iteration": 4.891269207000732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099997, + "balance_loss_mlp": 1.06306624, + "epoch": 0.22970373220469412, + "flos": 644045050368.0, + "grad_norm": 0.06658536787009234, + "language_loss": 0.79028845, + "learning_rate": 0.0008990196572654427, + "loss": 0.80128849, + "num_input_tokens_seen": 99290592, + "router_z_loss_mlp": 0.36914062, + "step": 1194, + "time_per_iteration": 2.8504366874694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100467, + "balance_loss_mlp": 1.06582475, + "epoch": 0.22989611388995768, + "flos": 499945001472.0, + "grad_norm": 0.048025217626556156, + "language_loss": 0.87748766, + "learning_rate": 0.0008988318427467426, + "loss": 0.88849235, + "num_input_tokens_seen": 99366096, + "router_z_loss_mlp": 0.34667969, + "step": 1195, + "time_per_iteration": 2.735084056854248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099859, + "balance_loss_mlp": 1.06524014, + "epoch": 0.23008849557522124, + "flos": 1096071796224.0, + "grad_norm": 0.06731751876810108, + "language_loss": 0.86263168, + "learning_rate": 0.0008986438733877887, + "loss": 0.87363023, + "num_input_tokens_seen": 99456768, + "router_z_loss_mlp": 0.34667969, + "step": 1196, + "time_per_iteration": 3.435035228729248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100924, + "balance_loss_mlp": 1.06606746, + "epoch": 0.2302808772604848, + "flos": 683313546240.0, + "grad_norm": 0.04733445604251135, + "language_loss": 0.84099567, + "learning_rate": 0.0008984557492615576, + "loss": 0.85200489, + "num_input_tokens_seen": 99539616, + "router_z_loss_mlp": 0.34887695, + "step": 1197, + "time_per_iteration": 2.927668809890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107534, + "balance_loss_mlp": 1.07317793, + "epoch": 0.23047325894574835, + "flos": 528664928256.0, + "grad_norm": 0.0630370564804667, + "language_loss": 0.89949608, + "learning_rate": 0.0008982674704410854, + "loss": 0.91057146, + "num_input_tokens_seen": 99612064, + "router_z_loss_mlp": 0.34399414, + "step": 1198, + "time_per_iteration": 2.691016435623169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_mlp": 1.07228875, + "epoch": 0.23066564063101191, + "flos": 682427455488.0, + "grad_norm": 0.06209648084563375, + "language_loss": 0.77829844, + "learning_rate": 0.0008980790369994682, + "loss": 0.78937328, + "num_input_tokens_seen": 99691040, + "router_z_loss_mlp": 0.35205078, + "step": 1199, + "time_per_iteration": 2.9320883750915527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06525421, + "epoch": 0.2308580223162755, + "flos": 558247624704.0, + "grad_norm": 0.09180159748966574, + "language_loss": 0.87396461, + "learning_rate": 0.000897890449009863, + "loss": 0.88496947, + "num_input_tokens_seen": 99762016, + "router_z_loss_mlp": 0.3527832, + "step": 1200, + "time_per_iteration": 2.6804869174957275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06183052, + "epoch": 0.23105040400153906, + "flos": 555406060032.0, + "grad_norm": 0.05982856494430897, + "language_loss": 0.90313268, + "learning_rate": 0.0008977017065454853, + "loss": 0.9140957, + "num_input_tokens_seen": 99835552, + "router_z_loss_mlp": 0.3449707, + "step": 1201, + "time_per_iteration": 2.639636754989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090282, + "balance_loss_mlp": 1.05556786, + "epoch": 0.23124278568680262, + "flos": 704474855424.0, + "grad_norm": 0.06077351963181601, + "language_loss": 0.80804175, + "learning_rate": 0.0008975128096796121, + "loss": 0.81894457, + "num_input_tokens_seen": 99910784, + "router_z_loss_mlp": 0.34765625, + "step": 1202, + "time_per_iteration": 2.8410260677337646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078695, + "balance_loss_mlp": 1.04431474, + "epoch": 0.23143516737206618, + "flos": 612469002240.0, + "grad_norm": 0.07413481943536562, + "language_loss": 0.85940087, + "learning_rate": 0.0008973237584855794, + "loss": 0.87018776, + "num_input_tokens_seen": 99991120, + "router_z_loss_mlp": 0.34423828, + "step": 1203, + "time_per_iteration": 2.898423671722412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077022, + "balance_loss_mlp": 1.04233205, + "epoch": 0.23162754905732974, + "flos": 389030238720.0, + "grad_norm": 0.06038618944932519, + "language_loss": 0.8201915, + "learning_rate": 0.0008971345530367832, + "loss": 0.83096182, + "num_input_tokens_seen": 100053888, + "router_z_loss_mlp": 0.34716797, + "step": 1204, + "time_per_iteration": 2.486668586730957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082339, + "balance_loss_mlp": 1.04738712, + "epoch": 0.2318199307425933, + "flos": 667481928192.0, + "grad_norm": 0.05427081728260985, + "language_loss": 0.85029405, + "learning_rate": 0.0008969451934066799, + "loss": 0.86111748, + "num_input_tokens_seen": 100124176, + "router_z_loss_mlp": 0.34960938, + "step": 1205, + "time_per_iteration": 2.771306276321411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091653, + "balance_loss_mlp": 1.05662966, + "epoch": 0.23201231242785686, + "flos": 666093860352.0, + "grad_norm": 0.0707913589572404, + "language_loss": 0.80143172, + "learning_rate": 0.0008967556796687854, + "loss": 0.81234825, + "num_input_tokens_seen": 100205296, + "router_z_loss_mlp": 0.35058594, + "step": 1206, + "time_per_iteration": 2.8904309272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087698, + "balance_loss_mlp": 1.05353224, + "epoch": 0.23220469411312042, + "flos": 748498281984.0, + "grad_norm": 0.05559113870944949, + "language_loss": 0.83954245, + "learning_rate": 0.0008965660118966752, + "loss": 0.8504194, + "num_input_tokens_seen": 100279440, + "router_z_loss_mlp": 0.34204102, + "step": 1207, + "time_per_iteration": 2.9140615463256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108846, + "balance_loss_mlp": 1.05529559, + "epoch": 0.232397075798384, + "flos": 666763163136.0, + "grad_norm": 0.04975334384076733, + "language_loss": 0.90441763, + "learning_rate": 0.0008963761901639851, + "loss": 0.91530222, + "num_input_tokens_seen": 100354512, + "router_z_loss_mlp": 0.33154297, + "step": 1208, + "time_per_iteration": 2.8032286167144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094486, + "balance_loss_mlp": 1.06008244, + "epoch": 0.23258945748364757, + "flos": 609937357824.0, + "grad_norm": 0.05840728669351643, + "language_loss": 0.83201033, + "learning_rate": 0.0008961862145444103, + "loss": 0.84295517, + "num_input_tokens_seen": 100426848, + "router_z_loss_mlp": 0.34399414, + "step": 1209, + "time_per_iteration": 2.7161943912506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_mlp": 1.05954397, + "epoch": 0.23278183916891113, + "flos": 489397842432.0, + "grad_norm": 0.06743904317466738, + "language_loss": 0.85216832, + "learning_rate": 0.0008959960851117059, + "loss": 0.86311138, + "num_input_tokens_seen": 100496176, + "router_z_loss_mlp": 0.34790039, + "step": 1210, + "time_per_iteration": 2.5817031860351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095047, + "balance_loss_mlp": 1.06014228, + "epoch": 0.23297422085417469, + "flos": 511314232320.0, + "grad_norm": 0.057575168534165826, + "language_loss": 0.84338427, + "learning_rate": 0.0008958058019396868, + "loss": 0.85433477, + "num_input_tokens_seen": 100575072, + "router_z_loss_mlp": 0.34936523, + "step": 1211, + "time_per_iteration": 2.7788164615631104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091213, + "balance_loss_mlp": 1.05730987, + "epoch": 0.23316660253943824, + "flos": 546145072128.0, + "grad_norm": 0.057082370400879795, + "language_loss": 0.86897939, + "learning_rate": 0.0008956153651022274, + "loss": 0.87989151, + "num_input_tokens_seen": 100648304, + "router_z_loss_mlp": 0.33935547, + "step": 1212, + "time_per_iteration": 2.7309062480926514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090204, + "balance_loss_mlp": 1.05608642, + "epoch": 0.2333589842247018, + "flos": 509998947840.0, + "grad_norm": 0.06317396982696966, + "language_loss": 0.84641176, + "learning_rate": 0.0008954247746732618, + "loss": 0.85731381, + "num_input_tokens_seen": 100717616, + "router_z_loss_mlp": 0.34155273, + "step": 1213, + "time_per_iteration": 2.619058609008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084199, + "balance_loss_mlp": 1.05072522, + "epoch": 0.23355136590996536, + "flos": 662834686464.0, + "grad_norm": 0.09780220222501788, + "language_loss": 0.90954423, + "learning_rate": 0.0008952340307267837, + "loss": 0.9203862, + "num_input_tokens_seen": 100797056, + "router_z_loss_mlp": 0.33496094, + "step": 1214, + "time_per_iteration": 2.869351387023926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108593, + "balance_loss_mlp": 1.05128753, + "epoch": 0.23374374759522892, + "flos": 508206417408.0, + "grad_norm": 0.061496426320555984, + "language_loss": 0.83373952, + "learning_rate": 0.0008950431333368468, + "loss": 0.84459883, + "num_input_tokens_seen": 100863632, + "router_z_loss_mlp": 0.34667969, + "step": 1215, + "time_per_iteration": 2.5557806491851807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093645, + "balance_loss_mlp": 1.05928898, + "epoch": 0.2339361292804925, + "flos": 1293964028928.0, + "grad_norm": 0.062331860667319446, + "language_loss": 0.84730738, + "learning_rate": 0.0008948520825775634, + "loss": 0.85824382, + "num_input_tokens_seen": 100950272, + "router_z_loss_mlp": 0.34399414, + "step": 1216, + "time_per_iteration": 3.6050164699554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098293, + "balance_loss_mlp": 1.06343639, + "epoch": 0.23412851096575607, + "flos": 705617021952.0, + "grad_norm": 0.06500023378725601, + "language_loss": 0.84162283, + "learning_rate": 0.0008946608785231067, + "loss": 0.8526057, + "num_input_tokens_seen": 101031008, + "router_z_loss_mlp": 0.34863281, + "step": 1217, + "time_per_iteration": 2.858696699142456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109995, + "balance_loss_mlp": 1.065045, + "epoch": 0.23432089265101963, + "flos": 438036968448.0, + "grad_norm": 0.06356573317347913, + "language_loss": 0.84325957, + "learning_rate": 0.0008944695212477084, + "loss": 0.85425907, + "num_input_tokens_seen": 101094688, + "router_z_loss_mlp": 0.34912109, + "step": 1218, + "time_per_iteration": 2.4787168502807617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107192, + "balance_loss_mlp": 1.07190585, + "epoch": 0.2345132743362832, + "flos": 480697058304.0, + "grad_norm": 0.05460931090439532, + "language_loss": 0.86098325, + "learning_rate": 0.0008942780108256599, + "loss": 0.87205517, + "num_input_tokens_seen": 101163744, + "router_z_loss_mlp": 0.35327148, + "step": 1219, + "time_per_iteration": 2.574692726135254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105521, + "balance_loss_mlp": 1.06966305, + "epoch": 0.23470565602154675, + "flos": 411231817728.0, + "grad_norm": 0.05853057396081394, + "language_loss": 0.86360055, + "learning_rate": 0.0008940863473313121, + "loss": 0.87465572, + "num_input_tokens_seen": 101226480, + "router_z_loss_mlp": 0.35839844, + "step": 1220, + "time_per_iteration": 2.4849462509155273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115361, + "balance_loss_mlp": 1.08024168, + "epoch": 0.2348980377068103, + "flos": 545189170176.0, + "grad_norm": 0.0745659618807548, + "language_loss": 0.87691534, + "learning_rate": 0.0008938945308390756, + "loss": 0.88806891, + "num_input_tokens_seen": 101291824, + "router_z_loss_mlp": 0.3515625, + "step": 1221, + "time_per_iteration": 2.6100285053253174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107677, + "balance_loss_mlp": 1.07248664, + "epoch": 0.23509041939207387, + "flos": 575465900544.0, + "grad_norm": 0.055913245264753976, + "language_loss": 0.87316763, + "learning_rate": 0.00089370256142342, + "loss": 0.88424438, + "num_input_tokens_seen": 101367216, + "router_z_loss_mlp": 0.35205078, + "step": 1222, + "time_per_iteration": 2.726897716522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109541, + "balance_loss_mlp": 1.06007659, + "epoch": 0.23528280107733743, + "flos": 588568025088.0, + "grad_norm": 0.04976165943815558, + "language_loss": 0.85095507, + "learning_rate": 0.0008935104391588746, + "loss": 0.86190915, + "num_input_tokens_seen": 101438992, + "router_z_loss_mlp": 0.35351562, + "step": 1223, + "time_per_iteration": 2.7249879837036133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108966, + "balance_loss_mlp": 1.07313156, + "epoch": 0.235475182762601, + "flos": 822948235776.0, + "grad_norm": 0.05651852634602403, + "language_loss": 0.82749176, + "learning_rate": 0.0008933181641200276, + "loss": 0.83858138, + "num_input_tokens_seen": 101534464, + "router_z_loss_mlp": 0.35839844, + "step": 1224, + "time_per_iteration": 3.1651737689971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094197, + "balance_loss_mlp": 1.06017447, + "epoch": 0.23566756444786457, + "flos": 679865287680.0, + "grad_norm": 0.06356150049585653, + "language_loss": 0.8609674, + "learning_rate": 0.0008931257363815271, + "loss": 0.87190938, + "num_input_tokens_seen": 101616496, + "router_z_loss_mlp": 0.34033203, + "step": 1225, + "time_per_iteration": 2.891789674758911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091395, + "balance_loss_mlp": 1.05711007, + "epoch": 0.23585994613312813, + "flos": 701481931776.0, + "grad_norm": 0.04853721262867189, + "language_loss": 0.89892405, + "learning_rate": 0.0008929331560180798, + "loss": 0.90983796, + "num_input_tokens_seen": 101694496, + "router_z_loss_mlp": 0.34277344, + "step": 1226, + "time_per_iteration": 2.934101104736328 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093973, + "balance_loss_mlp": 1.06002271, + "epoch": 0.2360523278183917, + "flos": 523923144192.0, + "grad_norm": 0.06491881814379113, + "language_loss": 0.91129786, + "learning_rate": 0.0008927404231044525, + "loss": 0.92223763, + "num_input_tokens_seen": 101766160, + "router_z_loss_mlp": 0.33984375, + "step": 1227, + "time_per_iteration": 2.682377815246582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083665, + "balance_loss_mlp": 1.0493325, + "epoch": 0.23624470950365525, + "flos": 524027860992.0, + "grad_norm": 0.053423388326064705, + "language_loss": 0.81944436, + "learning_rate": 0.0008925475377154703, + "loss": 0.83028102, + "num_input_tokens_seen": 101844160, + "router_z_loss_mlp": 0.34375, + "step": 1228, + "time_per_iteration": 2.7511117458343506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077291, + "balance_loss_mlp": 1.04169512, + "epoch": 0.2364370911889188, + "flos": 596525313024.0, + "grad_norm": 0.05836717970983508, + "language_loss": 0.8241868, + "learning_rate": 0.0008923544999260183, + "loss": 0.83495975, + "num_input_tokens_seen": 101917968, + "router_z_loss_mlp": 0.35644531, + "step": 1229, + "time_per_iteration": 2.7915079593658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087923, + "balance_loss_mlp": 1.05194569, + "epoch": 0.23662947287418237, + "flos": 756519588864.0, + "grad_norm": 0.08156392485297027, + "language_loss": 0.91757852, + "learning_rate": 0.00089216130981104, + "loss": 0.92845774, + "num_input_tokens_seen": 101996880, + "router_z_loss_mlp": 0.35986328, + "step": 1230, + "time_per_iteration": 3.037900924682617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089884, + "balance_loss_mlp": 1.0531199, + "epoch": 0.23682185455944593, + "flos": 545907935232.0, + "grad_norm": 0.05473268619072285, + "language_loss": 0.82659578, + "learning_rate": 0.000891967967445539, + "loss": 0.83749461, + "num_input_tokens_seen": 102067936, + "router_z_loss_mlp": 0.36743164, + "step": 1231, + "time_per_iteration": 2.6595497131347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088497, + "balance_loss_mlp": 1.05201924, + "epoch": 0.2370142362447095, + "flos": 661977709056.0, + "grad_norm": 0.04604146434030928, + "language_loss": 0.88502473, + "learning_rate": 0.0008917744729045772, + "loss": 0.89590967, + "num_input_tokens_seen": 102147552, + "router_z_loss_mlp": 0.36499023, + "step": 1232, + "time_per_iteration": 2.851651668548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095833, + "balance_loss_mlp": 1.05868709, + "epoch": 0.23720661792997308, + "flos": 683361598464.0, + "grad_norm": 0.06835104069372223, + "language_loss": 0.84165156, + "learning_rate": 0.0008915808262632757, + "loss": 0.85260987, + "num_input_tokens_seen": 102224480, + "router_z_loss_mlp": 0.37133789, + "step": 1233, + "time_per_iteration": 2.8114235401153564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095976, + "balance_loss_mlp": 1.05892599, + "epoch": 0.23739899961523664, + "flos": 558631738368.0, + "grad_norm": 0.055258261409357204, + "language_loss": 0.92769438, + "learning_rate": 0.0008913870275968148, + "loss": 0.93865418, + "num_input_tokens_seen": 102297392, + "router_z_loss_mlp": 0.37036133, + "step": 1234, + "time_per_iteration": 2.705349922180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092916, + "balance_loss_mlp": 1.05629516, + "epoch": 0.2375913813005002, + "flos": 889144128000.0, + "grad_norm": 0.12876854850300654, + "language_loss": 0.87540263, + "learning_rate": 0.0008911930769804342, + "loss": 0.8863318, + "num_input_tokens_seen": 102386032, + "router_z_loss_mlp": 0.36621094, + "step": 1235, + "time_per_iteration": 3.2342941761016846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091192, + "balance_loss_mlp": 1.05492854, + "epoch": 0.23778376298576376, + "flos": 640810607616.0, + "grad_norm": 0.044375832072417805, + "language_loss": 0.91481459, + "learning_rate": 0.0008909989744894318, + "loss": 0.92572653, + "num_input_tokens_seen": 102463504, + "router_z_loss_mlp": 0.36303711, + "step": 1236, + "time_per_iteration": 2.8858232498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089597, + "balance_loss_mlp": 1.05333364, + "epoch": 0.23797614467102732, + "flos": 616540073472.0, + "grad_norm": 0.05892762197337364, + "language_loss": 0.81707233, + "learning_rate": 0.0008908047201991649, + "loss": 0.82796836, + "num_input_tokens_seen": 102529632, + "router_z_loss_mlp": 0.36279297, + "step": 1237, + "time_per_iteration": 2.785226583480835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085624, + "balance_loss_mlp": 1.05071974, + "epoch": 0.23816852635629088, + "flos": 623941539840.0, + "grad_norm": 0.051487502947417364, + "language_loss": 0.86561942, + "learning_rate": 0.0008906103141850502, + "loss": 0.87647569, + "num_input_tokens_seen": 102610192, + "router_z_loss_mlp": 0.34960938, + "step": 1238, + "time_per_iteration": 2.868241310119629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095513, + "balance_loss_mlp": 1.05901158, + "epoch": 0.23836090804155444, + "flos": 521180504064.0, + "grad_norm": 0.07170300234131513, + "language_loss": 0.88119614, + "learning_rate": 0.0008904157565225621, + "loss": 0.89215136, + "num_input_tokens_seen": 102681216, + "router_z_loss_mlp": 0.36499023, + "step": 1239, + "time_per_iteration": 2.610048294067383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092952, + "balance_loss_mlp": 1.05716562, + "epoch": 0.238553289726818, + "flos": 1153527616512.0, + "grad_norm": 0.07764557472008667, + "language_loss": 0.82042629, + "learning_rate": 0.000890221047287235, + "loss": 0.83135581, + "num_input_tokens_seen": 102777184, + "router_z_loss_mlp": 0.3581543, + "step": 1240, + "time_per_iteration": 3.547147274017334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102634, + "balance_loss_mlp": 1.06772995, + "epoch": 0.23874567141208156, + "flos": 499600175616.0, + "grad_norm": 0.07123563443936186, + "language_loss": 0.91052604, + "learning_rate": 0.0008900261865546615, + "loss": 0.92155242, + "num_input_tokens_seen": 102845744, + "router_z_loss_mlp": 0.34936523, + "step": 1241, + "time_per_iteration": 2.6277406215667725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110411, + "balance_loss_mlp": 1.06768012, + "epoch": 0.23893805309734514, + "flos": 556657325568.0, + "grad_norm": 0.08027126565183675, + "language_loss": 0.84991688, + "learning_rate": 0.0008898311744004936, + "loss": 0.86095798, + "num_input_tokens_seen": 102918064, + "router_z_loss_mlp": 0.36425781, + "step": 1242, + "time_per_iteration": 2.687009811401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112655, + "balance_loss_mlp": 1.07708287, + "epoch": 0.2391304347826087, + "flos": 549009957888.0, + "grad_norm": 0.05686617926086787, + "language_loss": 0.86918116, + "learning_rate": 0.0008896360109004414, + "loss": 0.88030773, + "num_input_tokens_seen": 102983920, + "router_z_loss_mlp": 0.35595703, + "step": 1243, + "time_per_iteration": 2.6292564868927 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111871, + "balance_loss_mlp": 1.07629931, + "epoch": 0.23932281646787226, + "flos": 515794148352.0, + "grad_norm": 0.05075175877282041, + "language_loss": 0.84481502, + "learning_rate": 0.0008894406961302742, + "loss": 0.85593379, + "num_input_tokens_seen": 103053328, + "router_z_loss_mlp": 0.35595703, + "step": 1244, + "time_per_iteration": 2.5960640907287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122594, + "balance_loss_mlp": 1.08737969, + "epoch": 0.23951519815313582, + "flos": 743353445376.0, + "grad_norm": 0.06488001286924965, + "language_loss": 0.84053004, + "learning_rate": 0.0008892452301658201, + "loss": 0.85175598, + "num_input_tokens_seen": 103128208, + "router_z_loss_mlp": 0.35253906, + "step": 1245, + "time_per_iteration": 2.9320998191833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123528, + "balance_loss_mlp": 1.08728814, + "epoch": 0.23970757983839938, + "flos": 553855048704.0, + "grad_norm": 0.05553543969160018, + "language_loss": 0.83631629, + "learning_rate": 0.0008890496130829653, + "loss": 0.84755158, + "num_input_tokens_seen": 103197392, + "router_z_loss_mlp": 0.36230469, + "step": 1246, + "time_per_iteration": 2.6420071125030518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123934, + "balance_loss_mlp": 1.08802795, + "epoch": 0.23989996152366294, + "flos": 480416251392.0, + "grad_norm": 0.0595921721906752, + "language_loss": 0.85551775, + "learning_rate": 0.0008888538449576555, + "loss": 0.86675715, + "num_input_tokens_seen": 103265328, + "router_z_loss_mlp": 0.359375, + "step": 1247, + "time_per_iteration": 2.544706344604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123543, + "balance_loss_mlp": 1.08687472, + "epoch": 0.2400923432089265, + "flos": 485069285376.0, + "grad_norm": 0.06973867138143126, + "language_loss": 0.82958472, + "learning_rate": 0.0008886579258658944, + "loss": 0.84082007, + "num_input_tokens_seen": 103331632, + "router_z_loss_mlp": 0.36669922, + "step": 1248, + "time_per_iteration": 2.5460424423217773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108744, + "balance_loss_mlp": 1.0724808, + "epoch": 0.24028472489419006, + "flos": 623247505920.0, + "grad_norm": 0.04817062293972818, + "language_loss": 0.85353303, + "learning_rate": 0.0008884618558837446, + "loss": 0.86462045, + "num_input_tokens_seen": 103405408, + "router_z_loss_mlp": 0.36279297, + "step": 1249, + "time_per_iteration": 2.80222487449646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125107, + "balance_loss_mlp": 1.08765173, + "epoch": 0.24047710657945365, + "flos": 601302002688.0, + "grad_norm": 0.052194699096834656, + "language_loss": 0.86387813, + "learning_rate": 0.0008882656350873273, + "loss": 0.87512922, + "num_input_tokens_seen": 103487216, + "router_z_loss_mlp": 0.37426758, + "step": 1250, + "time_per_iteration": 2.830887794494629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136265, + "balance_loss_mlp": 1.09911942, + "epoch": 0.2406694882647172, + "flos": 841199579136.0, + "grad_norm": 0.07156482775024936, + "language_loss": 0.86951184, + "learning_rate": 0.0008880692635528219, + "loss": 0.88087451, + "num_input_tokens_seen": 103568640, + "router_z_loss_mlp": 0.37109375, + "step": 1251, + "time_per_iteration": 3.0439021587371826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140454, + "balance_loss_mlp": 1.10450029, + "epoch": 0.24086186994998077, + "flos": 526789440000.0, + "grad_norm": 0.062254670736574515, + "language_loss": 0.89187038, + "learning_rate": 0.0008878727413564669, + "loss": 0.90327489, + "num_input_tokens_seen": 103640784, + "router_z_loss_mlp": 0.36010742, + "step": 1252, + "time_per_iteration": 2.7363240718841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094248, + "balance_loss_mlp": 1.07717752, + "epoch": 0.24105425163524433, + "flos": 1337464673280.0, + "grad_norm": 0.032126183170312766, + "language_loss": 0.80135596, + "learning_rate": 0.0008876760685745588, + "loss": 0.81229842, + "num_input_tokens_seen": 103865824, + "router_z_loss_mlp": 0.17089844, + "step": 1253, + "time_per_iteration": 4.8370680809021 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01175937, + "balance_loss_mlp": 1.13755202, + "epoch": 0.24124663332050789, + "flos": 613822164480.0, + "grad_norm": 0.06436318886622608, + "language_loss": 0.78452635, + "learning_rate": 0.0008874792452834528, + "loss": 0.79628575, + "num_input_tokens_seen": 103939872, + "router_z_loss_mlp": 0.38354492, + "step": 1254, + "time_per_iteration": 2.724947452545166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151033, + "balance_loss_mlp": 1.11381602, + "epoch": 0.24143901500577145, + "flos": 575278225920.0, + "grad_norm": 0.08996846201845816, + "language_loss": 0.87516546, + "learning_rate": 0.0008872822715595626, + "loss": 0.88667583, + "num_input_tokens_seen": 104011120, + "router_z_loss_mlp": 0.37207031, + "step": 1255, + "time_per_iteration": 2.676539659500122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118396, + "balance_loss_mlp": 1.08275259, + "epoch": 0.241631396691035, + "flos": 494941349376.0, + "grad_norm": 0.06475486920780314, + "language_loss": 0.87080252, + "learning_rate": 0.0008870851474793598, + "loss": 0.88198644, + "num_input_tokens_seen": 104077040, + "router_z_loss_mlp": 0.35668945, + "step": 1256, + "time_per_iteration": 2.5523862838745117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108714, + "balance_loss_mlp": 1.07287991, + "epoch": 0.24182377837629856, + "flos": 635891323392.0, + "grad_norm": 0.0627898868455093, + "language_loss": 0.89724898, + "learning_rate": 0.0008868878731193752, + "loss": 0.90833616, + "num_input_tokens_seen": 104150880, + "router_z_loss_mlp": 0.35888672, + "step": 1257, + "time_per_iteration": 2.8152451515197754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094242, + "balance_loss_mlp": 1.05933762, + "epoch": 0.24201616006156215, + "flos": 514938580992.0, + "grad_norm": 0.06450256361572139, + "language_loss": 0.89708877, + "learning_rate": 0.0008866904485561973, + "loss": 0.90803117, + "num_input_tokens_seen": 104223696, + "router_z_loss_mlp": 0.34936523, + "step": 1258, + "time_per_iteration": 2.7304298877716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095529, + "balance_loss_mlp": 1.05945659, + "epoch": 0.2422085417468257, + "flos": 614837703168.0, + "grad_norm": 0.05809143078881904, + "language_loss": 0.83024096, + "learning_rate": 0.000886492873866473, + "loss": 0.8411963, + "num_input_tokens_seen": 104301728, + "router_z_loss_mlp": 0.36108398, + "step": 1259, + "time_per_iteration": 2.828904628753662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090865, + "balance_loss_mlp": 1.05576968, + "epoch": 0.24240092343208927, + "flos": 585515464704.0, + "grad_norm": 0.07568124142212555, + "language_loss": 0.84760439, + "learning_rate": 0.000886295149126908, + "loss": 0.85851306, + "num_input_tokens_seen": 104374480, + "router_z_loss_mlp": 0.35131836, + "step": 1260, + "time_per_iteration": 2.7011313438415527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109068, + "balance_loss_mlp": 1.05537009, + "epoch": 0.24259330511735283, + "flos": 761930675712.0, + "grad_norm": 0.05459059834864095, + "language_loss": 0.85652769, + "learning_rate": 0.0008860972744142655, + "loss": 0.8674345, + "num_input_tokens_seen": 104452384, + "router_z_loss_mlp": 0.35327148, + "step": 1261, + "time_per_iteration": 2.9039082527160645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084953, + "balance_loss_mlp": 1.05028725, + "epoch": 0.2427856868026164, + "flos": 626566316544.0, + "grad_norm": 0.06267274795834049, + "language_loss": 0.8183161, + "learning_rate": 0.0008858992498053671, + "loss": 0.82916564, + "num_input_tokens_seen": 104532576, + "router_z_loss_mlp": 0.34692383, + "step": 1262, + "time_per_iteration": 2.8293697834014893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080455, + "balance_loss_mlp": 1.06586385, + "epoch": 0.24297806848787995, + "flos": 1510840470528.0, + "grad_norm": 0.02756761643082338, + "language_loss": 0.7658875, + "learning_rate": 0.0008857010753770934, + "loss": 0.77669203, + "num_input_tokens_seen": 104765216, + "router_z_loss_mlp": 0.14550781, + "step": 1263, + "time_per_iteration": 4.8116748332977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087756, + "balance_loss_mlp": 1.05328119, + "epoch": 0.2431704501731435, + "flos": 541669538304.0, + "grad_norm": 0.05501719814044903, + "language_loss": 0.83684969, + "learning_rate": 0.0008855027512063817, + "loss": 0.8477273, + "num_input_tokens_seen": 104836912, + "router_z_loss_mlp": 0.3449707, + "step": 1264, + "time_per_iteration": 2.6995394229888916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084053, + "balance_loss_mlp": 1.0493865, + "epoch": 0.24336283185840707, + "flos": 523588492800.0, + "grad_norm": 0.06804757776515359, + "language_loss": 0.85974693, + "learning_rate": 0.0008853042773702292, + "loss": 0.87058747, + "num_input_tokens_seen": 104909280, + "router_z_loss_mlp": 0.34692383, + "step": 1265, + "time_per_iteration": 2.683969497680664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086337, + "balance_loss_mlp": 1.05002618, + "epoch": 0.24355521354367063, + "flos": 536839004160.0, + "grad_norm": 0.05444938358074035, + "language_loss": 0.87678754, + "learning_rate": 0.0008851056539456896, + "loss": 0.88765097, + "num_input_tokens_seen": 104982560, + "router_z_loss_mlp": 0.36303711, + "step": 1266, + "time_per_iteration": 2.674891471862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.04830909, + "epoch": 0.24374759522893422, + "flos": 930050975232.0, + "grad_norm": 0.04940136823280911, + "language_loss": 0.82195789, + "learning_rate": 0.0008849068810098755, + "loss": 0.83279288, + "num_input_tokens_seen": 105075056, + "router_z_loss_mlp": 0.35229492, + "step": 1267, + "time_per_iteration": 3.27172589302063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083691, + "balance_loss_mlp": 1.04859626, + "epoch": 0.24393997691419778, + "flos": 427564002816.0, + "grad_norm": 0.07591960175092535, + "language_loss": 0.83287823, + "learning_rate": 0.0008847079586399575, + "loss": 0.84371519, + "num_input_tokens_seen": 105137536, + "router_z_loss_mlp": 0.35131836, + "step": 1268, + "time_per_iteration": 2.4539763927459717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010876, + "balance_loss_mlp": 1.05281472, + "epoch": 0.24413235859946134, + "flos": 578582479872.0, + "grad_norm": 0.059755639557228325, + "language_loss": 0.86095846, + "learning_rate": 0.0008845088869131641, + "loss": 0.87183452, + "num_input_tokens_seen": 105204848, + "router_z_loss_mlp": 0.34790039, + "step": 1269, + "time_per_iteration": 2.651010274887085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094661, + "balance_loss_mlp": 1.058851, + "epoch": 0.2443247402847249, + "flos": 529600481280.0, + "grad_norm": 0.07776240560166553, + "language_loss": 0.89366186, + "learning_rate": 0.0008843096659067818, + "loss": 0.90460849, + "num_input_tokens_seen": 105273456, + "router_z_loss_mlp": 0.35839844, + "step": 1270, + "time_per_iteration": 2.61082124710083 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108773, + "balance_loss_mlp": 1.05292046, + "epoch": 0.24451712196998845, + "flos": 695996651520.0, + "grad_norm": 0.05083497592617014, + "language_loss": 0.86395383, + "learning_rate": 0.000884110295698155, + "loss": 0.87483108, + "num_input_tokens_seen": 105355488, + "router_z_loss_mlp": 0.34863281, + "step": 1271, + "time_per_iteration": 2.930372476577759 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085133, + "balance_loss_mlp": 1.05089653, + "epoch": 0.24470950365525201, + "flos": 529575750144.0, + "grad_norm": 0.05520811698213447, + "language_loss": 0.86009014, + "learning_rate": 0.0008839107763646861, + "loss": 0.87094152, + "num_input_tokens_seen": 105421568, + "router_z_loss_mlp": 0.34277344, + "step": 1272, + "time_per_iteration": 2.576322078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088964, + "balance_loss_mlp": 1.05324888, + "epoch": 0.24490188534051557, + "flos": 491091448320.0, + "grad_norm": 0.0616556586287024, + "language_loss": 0.9024111, + "learning_rate": 0.0008837111079838353, + "loss": 0.91330075, + "num_input_tokens_seen": 105493072, + "router_z_loss_mlp": 0.35742188, + "step": 1273, + "time_per_iteration": 2.6859118938446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109117, + "balance_loss_mlp": 1.05631351, + "epoch": 0.24509426702577913, + "flos": 473916842496.0, + "grad_norm": 0.05704478566457949, + "language_loss": 0.89869869, + "learning_rate": 0.000883511290633121, + "loss": 0.90961039, + "num_input_tokens_seen": 105559840, + "router_z_loss_mlp": 0.34887695, + "step": 1274, + "time_per_iteration": 2.5262861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06152773, + "epoch": 0.24528664871104272, + "flos": 550329624576.0, + "grad_norm": 0.04914382449005864, + "language_loss": 0.92288065, + "learning_rate": 0.000883311324390119, + "loss": 0.93384475, + "num_input_tokens_seen": 105634448, + "router_z_loss_mlp": 0.34887695, + "step": 1275, + "time_per_iteration": 2.6791441440582275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100279, + "balance_loss_mlp": 1.0631578, + "epoch": 0.24547903039630628, + "flos": 825546871296.0, + "grad_norm": 0.0705624444694786, + "language_loss": 0.81542301, + "learning_rate": 0.0008831112093324629, + "loss": 0.82642579, + "num_input_tokens_seen": 105711936, + "router_z_loss_mlp": 0.37060547, + "step": 1276, + "time_per_iteration": 3.0612823963165283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100002, + "balance_loss_mlp": 1.06419206, + "epoch": 0.24567141208156984, + "flos": 591325221888.0, + "grad_norm": 0.0822852621967946, + "language_loss": 0.89184481, + "learning_rate": 0.0008829109455378444, + "loss": 0.90284485, + "num_input_tokens_seen": 105780240, + "router_z_loss_mlp": 0.35839844, + "step": 1277, + "time_per_iteration": 2.6601858139038086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091196, + "balance_loss_mlp": 1.05567181, + "epoch": 0.2458637937668334, + "flos": 547611715584.0, + "grad_norm": 0.05101212903881184, + "language_loss": 0.86474031, + "learning_rate": 0.000882710533084013, + "loss": 0.87565225, + "num_input_tokens_seen": 105849840, + "router_z_loss_mlp": 0.35546875, + "step": 1278, + "time_per_iteration": 2.6333553791046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087707, + "balance_loss_mlp": 1.05158627, + "epoch": 0.24605617545209696, + "flos": 515641379328.0, + "grad_norm": 0.04855931692812416, + "language_loss": 0.89387107, + "learning_rate": 0.0008825099720487755, + "loss": 0.9047482, + "num_input_tokens_seen": 105921488, + "router_z_loss_mlp": 0.36108398, + "step": 1279, + "time_per_iteration": 2.6388816833496094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.05943298, + "epoch": 0.24624855713736052, + "flos": 1510953951744.0, + "grad_norm": 0.03612446278815301, + "language_loss": 0.7526114, + "learning_rate": 0.0008823092625099967, + "loss": 0.76332873, + "num_input_tokens_seen": 106146816, + "router_z_loss_mlp": 0.12304688, + "step": 1280, + "time_per_iteration": 4.837193727493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01044143, + "balance_loss_mlp": 1.03145874, + "epoch": 0.24644093882262408, + "flos": 1526826419712.0, + "grad_norm": 0.020354868078157083, + "language_loss": 0.77944112, + "learning_rate": 0.0008821084045455987, + "loss": 0.78988254, + "num_input_tokens_seen": 106361568, + "router_z_loss_mlp": 0.12695312, + "step": 1281, + "time_per_iteration": 4.7473485469818115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078971, + "balance_loss_mlp": 1.04418564, + "epoch": 0.24663332050788764, + "flos": 658811667456.0, + "grad_norm": 0.060866999123497585, + "language_loss": 0.89327228, + "learning_rate": 0.0008819073982335619, + "loss": 0.90406203, + "num_input_tokens_seen": 106435296, + "router_z_loss_mlp": 0.34838867, + "step": 1282, + "time_per_iteration": 2.839691162109375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107951, + "balance_loss_mlp": 1.0446527, + "epoch": 0.24682570219315123, + "flos": 541510977024.0, + "grad_norm": 0.05752783194209404, + "language_loss": 0.84339237, + "learning_rate": 0.0008817062436519235, + "loss": 0.85418749, + "num_input_tokens_seen": 106507184, + "router_z_loss_mlp": 0.34887695, + "step": 1283, + "time_per_iteration": 2.6106019020080566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080546, + "balance_loss_mlp": 1.04459274, + "epoch": 0.24701808387841478, + "flos": 440455131648.0, + "grad_norm": 0.05999718389674832, + "language_loss": 0.89926815, + "learning_rate": 0.0008815049408787788, + "loss": 0.91007358, + "num_input_tokens_seen": 106571472, + "router_z_loss_mlp": 0.36010742, + "step": 1284, + "time_per_iteration": 2.5186686515808105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079282, + "balance_loss_mlp": 1.04518795, + "epoch": 0.24721046556367834, + "flos": 467826278400.0, + "grad_norm": 0.054777388157378364, + "language_loss": 0.8565737, + "learning_rate": 0.0008813034899922805, + "loss": 0.86736655, + "num_input_tokens_seen": 106638368, + "router_z_loss_mlp": 0.34106445, + "step": 1285, + "time_per_iteration": 2.5217878818511963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082636, + "balance_loss_mlp": 1.04730225, + "epoch": 0.2474028472489419, + "flos": 504183398400.0, + "grad_norm": 0.06351521025868076, + "language_loss": 0.90182853, + "learning_rate": 0.0008811018910706387, + "loss": 0.91265488, + "num_input_tokens_seen": 106705312, + "router_z_loss_mlp": 0.35375977, + "step": 1286, + "time_per_iteration": 2.549523115158081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010823, + "balance_loss_mlp": 1.04787278, + "epoch": 0.24759522893420546, + "flos": 479707660800.0, + "grad_norm": 0.06857789842871208, + "language_loss": 0.81978023, + "learning_rate": 0.0008809001441921211, + "loss": 0.83060318, + "num_input_tokens_seen": 106778624, + "router_z_loss_mlp": 0.34448242, + "step": 1287, + "time_per_iteration": 2.7147598266601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083728, + "balance_loss_mlp": 1.04984844, + "epoch": 0.24778761061946902, + "flos": 533446000128.0, + "grad_norm": 0.05733880184845353, + "language_loss": 0.85523212, + "learning_rate": 0.0008806982494350528, + "loss": 0.86606944, + "num_input_tokens_seen": 106847744, + "router_z_loss_mlp": 0.33911133, + "step": 1288, + "time_per_iteration": 2.6304967403411865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086884, + "balance_loss_mlp": 1.05197978, + "epoch": 0.24797999230473258, + "flos": 559513446912.0, + "grad_norm": 0.04849910181782432, + "language_loss": 0.90370154, + "learning_rate": 0.0008804962068778161, + "loss": 0.91457039, + "num_input_tokens_seen": 106927584, + "router_z_loss_mlp": 0.34936523, + "step": 1289, + "time_per_iteration": 2.8194985389709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087315, + "balance_loss_mlp": 1.05291104, + "epoch": 0.24817237398999614, + "flos": 623912426496.0, + "grad_norm": 0.05410640942937228, + "language_loss": 0.80728722, + "learning_rate": 0.0008802940165988511, + "loss": 0.81816041, + "num_input_tokens_seen": 107006656, + "router_z_loss_mlp": 0.34423828, + "step": 1290, + "time_per_iteration": 2.8703298568725586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096846, + "balance_loss_mlp": 1.06225193, + "epoch": 0.2483647556752597, + "flos": 611981581824.0, + "grad_norm": 0.06277561181530684, + "language_loss": 0.88376027, + "learning_rate": 0.000880091678676655, + "loss": 0.89472872, + "num_input_tokens_seen": 107084352, + "router_z_loss_mlp": 0.34619141, + "step": 1291, + "time_per_iteration": 2.7943451404571533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088363, + "balance_loss_mlp": 1.05496097, + "epoch": 0.2485571373605233, + "flos": 583270419456.0, + "grad_norm": 0.061640996967182685, + "language_loss": 0.89207399, + "learning_rate": 0.0008798891931897821, + "loss": 0.90295762, + "num_input_tokens_seen": 107158368, + "router_z_loss_mlp": 0.33422852, + "step": 1292, + "time_per_iteration": 2.7013609409332275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088304, + "balance_loss_mlp": 1.05463946, + "epoch": 0.24874951904578685, + "flos": 494503391232.0, + "grad_norm": 0.0568342609101268, + "language_loss": 0.84605837, + "learning_rate": 0.0008796865602168447, + "loss": 0.8569414, + "num_input_tokens_seen": 107224256, + "router_z_loss_mlp": 0.33691406, + "step": 1293, + "time_per_iteration": 2.517571210861206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05448937, + "epoch": 0.2489419007310504, + "flos": 455925957120.0, + "grad_norm": 0.05011975537228715, + "language_loss": 0.88745099, + "learning_rate": 0.0008794837798365115, + "loss": 0.8983261, + "num_input_tokens_seen": 107292720, + "router_z_loss_mlp": 0.33032227, + "step": 1294, + "time_per_iteration": 2.6243135929107666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093031, + "balance_loss_mlp": 1.05958056, + "epoch": 0.24913428241631397, + "flos": 485198733312.0, + "grad_norm": 0.05031013210073455, + "language_loss": 0.88537574, + "learning_rate": 0.0008792808521275089, + "loss": 0.89630604, + "num_input_tokens_seen": 107368576, + "router_z_loss_mlp": 0.3347168, + "step": 1295, + "time_per_iteration": 2.743821144104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090644, + "balance_loss_mlp": 1.05664551, + "epoch": 0.24932666410157753, + "flos": 518654651904.0, + "grad_norm": 0.0628198177294759, + "language_loss": 0.87554896, + "learning_rate": 0.0008790777771686206, + "loss": 0.8864553, + "num_input_tokens_seen": 107433856, + "router_z_loss_mlp": 0.34033203, + "step": 1296, + "time_per_iteration": 2.55996036529541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091195, + "balance_loss_mlp": 1.05819798, + "epoch": 0.2495190457868411, + "flos": 472365831168.0, + "grad_norm": 0.05367084005526609, + "language_loss": 0.85479438, + "learning_rate": 0.0008788745550386872, + "loss": 0.86570632, + "num_input_tokens_seen": 107500944, + "router_z_loss_mlp": 0.33007812, + "step": 1297, + "time_per_iteration": 2.555238723754883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099543, + "balance_loss_mlp": 1.06494844, + "epoch": 0.24971142747210465, + "flos": 745559202816.0, + "grad_norm": 0.05557204977607519, + "language_loss": 0.80045742, + "learning_rate": 0.0008786711858166063, + "loss": 0.81145287, + "num_input_tokens_seen": 107580000, + "router_z_loss_mlp": 0.34643555, + "step": 1298, + "time_per_iteration": 2.940908670425415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090601, + "balance_loss_mlp": 1.05757999, + "epoch": 0.2499038091573682, + "flos": 749222839296.0, + "grad_norm": 0.08262860681241094, + "language_loss": 0.83490336, + "learning_rate": 0.0008784676695813332, + "loss": 0.84580934, + "num_input_tokens_seen": 107660384, + "router_z_loss_mlp": 0.33032227, + "step": 1299, + "time_per_iteration": 2.966646432876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092772, + "balance_loss_mlp": 1.05870187, + "epoch": 0.2500961908426318, + "flos": 744741513216.0, + "grad_norm": 0.04756275395178792, + "language_loss": 0.84761405, + "learning_rate": 0.0008782640064118796, + "loss": 0.85854173, + "num_input_tokens_seen": 107736320, + "router_z_loss_mlp": 0.34082031, + "step": 1300, + "time_per_iteration": 2.8889827728271484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078115, + "balance_loss_mlp": 1.06447709, + "epoch": 0.2502885725278953, + "flos": 1416652180992.0, + "grad_norm": 0.036683670441934005, + "language_loss": 0.7618475, + "learning_rate": 0.0008780601963873149, + "loss": 0.77262866, + "num_input_tokens_seen": 107972608, + "router_z_loss_mlp": 0.13671875, + "step": 1301, + "time_per_iteration": 4.988169431686401 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094633, + "balance_loss_mlp": 1.06196928, + "epoch": 0.2504809542131589, + "flos": 514961902080.0, + "grad_norm": 0.05923567857946263, + "language_loss": 0.86476314, + "learning_rate": 0.0008778562395867648, + "loss": 0.87570941, + "num_input_tokens_seen": 108043312, + "router_z_loss_mlp": 0.32666016, + "step": 1302, + "time_per_iteration": 2.5900919437408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087269, + "balance_loss_mlp": 1.05436766, + "epoch": 0.25067333589842244, + "flos": 525562905600.0, + "grad_norm": 0.06049595368492962, + "language_loss": 0.83774143, + "learning_rate": 0.0008776521360894127, + "loss": 0.8486141, + "num_input_tokens_seen": 108114144, + "router_z_loss_mlp": 0.32910156, + "step": 1303, + "time_per_iteration": 2.6029298305511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_mlp": 1.02275884, + "epoch": 0.25086571758368603, + "flos": 1473085108224.0, + "grad_norm": 0.024331867442101186, + "language_loss": 0.78962064, + "learning_rate": 0.0008774478859744984, + "loss": 0.79997885, + "num_input_tokens_seen": 108338720, + "router_z_loss_mlp": 0.13085938, + "step": 1304, + "time_per_iteration": 4.800757646560669 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096353, + "balance_loss_mlp": 1.063833, + "epoch": 0.2510580992689496, + "flos": 528128045568.0, + "grad_norm": 0.053799887970574674, + "language_loss": 0.90341735, + "learning_rate": 0.0008772434893213186, + "loss": 0.91438091, + "num_input_tokens_seen": 108405456, + "router_z_loss_mlp": 0.32519531, + "step": 1305, + "time_per_iteration": 2.5816421508789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104465, + "balance_loss_mlp": 1.07123005, + "epoch": 0.25125048095421315, + "flos": 517192390656.0, + "grad_norm": 0.058690449713219205, + "language_loss": 0.84433925, + "learning_rate": 0.0008770389462092276, + "loss": 0.85538393, + "num_input_tokens_seen": 108474368, + "router_z_loss_mlp": 0.33251953, + "step": 1306, + "time_per_iteration": 2.6747090816497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106931, + "balance_loss_mlp": 1.07309926, + "epoch": 0.25144286263947674, + "flos": 620160039936.0, + "grad_norm": 0.16660488736040688, + "language_loss": 0.86719346, + "learning_rate": 0.0008768342567176357, + "loss": 0.87826276, + "num_input_tokens_seen": 108548864, + "router_z_loss_mlp": 0.33862305, + "step": 1307, + "time_per_iteration": 2.7788002490997314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098824, + "balance_loss_mlp": 1.06425333, + "epoch": 0.25163524432474027, + "flos": 503534444544.0, + "grad_norm": 0.04824933548887647, + "language_loss": 0.90589297, + "learning_rate": 0.0008766294209260107, + "loss": 0.91688126, + "num_input_tokens_seen": 108623072, + "router_z_loss_mlp": 0.34619141, + "step": 1308, + "time_per_iteration": 2.6300241947174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05852032, + "epoch": 0.25182762601000386, + "flos": 508821875712.0, + "grad_norm": 0.0633327884934456, + "language_loss": 0.91549027, + "learning_rate": 0.0008764244389138767, + "loss": 0.92641926, + "num_input_tokens_seen": 108690128, + "router_z_loss_mlp": 0.34399414, + "step": 1309, + "time_per_iteration": 2.574056625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088516, + "balance_loss_mlp": 1.05306351, + "epoch": 0.2520200076952674, + "flos": 633596815872.0, + "grad_norm": 0.05898934519456769, + "language_loss": 0.8269434, + "learning_rate": 0.000876219310760815, + "loss": 0.83782852, + "num_input_tokens_seen": 108770272, + "router_z_loss_mlp": 0.35449219, + "step": 1310, + "time_per_iteration": 2.87404465675354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010989, + "balance_loss_mlp": 1.06299448, + "epoch": 0.252212389380531, + "flos": 494385527808.0, + "grad_norm": 0.05968729718727878, + "language_loss": 0.8144334, + "learning_rate": 0.0008760140365464631, + "loss": 0.82542241, + "num_input_tokens_seen": 108840592, + "router_z_loss_mlp": 0.35913086, + "step": 1311, + "time_per_iteration": 2.599480390548706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105508, + "balance_loss_mlp": 1.07062793, + "epoch": 0.2524047710657945, + "flos": 490298489856.0, + "grad_norm": 0.06557576312810307, + "language_loss": 0.87226975, + "learning_rate": 0.0008758086163505156, + "loss": 0.88332486, + "num_input_tokens_seen": 108910064, + "router_z_loss_mlp": 0.34912109, + "step": 1312, + "time_per_iteration": 2.6165666580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112941, + "balance_loss_mlp": 1.07698762, + "epoch": 0.2525971527510581, + "flos": 647136898560.0, + "grad_norm": 0.06425852435188892, + "language_loss": 0.89039612, + "learning_rate": 0.0008756030502527239, + "loss": 0.90152562, + "num_input_tokens_seen": 108986336, + "router_z_loss_mlp": 0.36010742, + "step": 1313, + "time_per_iteration": 2.794595956802368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112418, + "balance_loss_mlp": 1.07636952, + "epoch": 0.2527895344363217, + "flos": 568991222784.0, + "grad_norm": 0.05792474282671988, + "language_loss": 0.90396988, + "learning_rate": 0.0008753973383328954, + "loss": 0.91509414, + "num_input_tokens_seen": 109059712, + "router_z_loss_mlp": 0.36108398, + "step": 1314, + "time_per_iteration": 2.66343355178833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110344, + "balance_loss_mlp": 1.07491553, + "epoch": 0.2529819161215852, + "flos": 513795004416.0, + "grad_norm": 0.10488361484557306, + "language_loss": 0.84231019, + "learning_rate": 0.0008751914806708952, + "loss": 0.8534137, + "num_input_tokens_seen": 109127504, + "router_z_loss_mlp": 0.35449219, + "step": 1315, + "time_per_iteration": 2.5714006423950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099121, + "balance_loss_mlp": 1.06357241, + "epoch": 0.2531742978068488, + "flos": 530979784704.0, + "grad_norm": 0.0646255116041034, + "language_loss": 0.81763697, + "learning_rate": 0.0008749854773466439, + "loss": 0.82862812, + "num_input_tokens_seen": 109198080, + "router_z_loss_mlp": 0.35571289, + "step": 1316, + "time_per_iteration": 2.6507568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093995, + "balance_loss_mlp": 1.05892396, + "epoch": 0.25336667949211233, + "flos": 596362369536.0, + "grad_norm": 0.11519177634747009, + "language_loss": 0.84297431, + "learning_rate": 0.0008747793284401192, + "loss": 0.8539142, + "num_input_tokens_seen": 109268368, + "router_z_loss_mlp": 0.35107422, + "step": 1317, + "time_per_iteration": 2.6708261966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109538, + "balance_loss_mlp": 1.05966473, + "epoch": 0.2535590611773759, + "flos": 601764691968.0, + "grad_norm": 0.05376009268762157, + "language_loss": 0.86145389, + "learning_rate": 0.0008745730340313551, + "loss": 0.87240773, + "num_input_tokens_seen": 109344112, + "router_z_loss_mlp": 0.35742188, + "step": 1318, + "time_per_iteration": 2.7465810775756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100043, + "balance_loss_mlp": 1.06504369, + "epoch": 0.25375144286263945, + "flos": 495079561728.0, + "grad_norm": 0.053440140598651036, + "language_loss": 0.8468703, + "learning_rate": 0.0008743665942004422, + "loss": 0.85787076, + "num_input_tokens_seen": 109414112, + "router_z_loss_mlp": 0.35009766, + "step": 1319, + "time_per_iteration": 2.632645606994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109509, + "balance_loss_mlp": 1.05908918, + "epoch": 0.25394382454790304, + "flos": 512219261952.0, + "grad_norm": 0.050076364746318706, + "language_loss": 0.92529714, + "learning_rate": 0.0008741600090275277, + "loss": 0.93624806, + "num_input_tokens_seen": 109484336, + "router_z_loss_mlp": 0.35986328, + "step": 1320, + "time_per_iteration": 2.564730405807495 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097426, + "balance_loss_mlp": 1.06092453, + "epoch": 0.25413620623316663, + "flos": 958586047488.0, + "grad_norm": 0.058049172943507095, + "language_loss": 0.83939385, + "learning_rate": 0.0008739532785928151, + "loss": 0.85036814, + "num_input_tokens_seen": 109590128, + "router_z_loss_mlp": 0.36474609, + "step": 1321, + "time_per_iteration": 3.4496617317199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056798, + "balance_loss_mlp": 1.04439986, + "epoch": 0.25432858791843016, + "flos": 1576445635584.0, + "grad_norm": 0.03297734471592195, + "language_loss": 0.74893582, + "learning_rate": 0.0008737464029765639, + "loss": 0.75950378, + "num_input_tokens_seen": 109816592, + "router_z_loss_mlp": 0.12353516, + "step": 1322, + "time_per_iteration": 4.803644418716431 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102711, + "balance_loss_mlp": 1.06706691, + "epoch": 0.25452096960369375, + "flos": 583530877440.0, + "grad_norm": 0.056711392027496164, + "language_loss": 0.83213425, + "learning_rate": 0.0008735393822590908, + "loss": 0.84316134, + "num_input_tokens_seen": 109890464, + "router_z_loss_mlp": 0.35668945, + "step": 1323, + "time_per_iteration": 2.6643528938293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099364, + "balance_loss_mlp": 1.06434083, + "epoch": 0.2547133512889573, + "flos": 508344629760.0, + "grad_norm": 0.06006943476027706, + "language_loss": 0.87018919, + "learning_rate": 0.0008733322165207681, + "loss": 0.88118285, + "num_input_tokens_seen": 109963408, + "router_z_loss_mlp": 0.35083008, + "step": 1324, + "time_per_iteration": 2.627495765686035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110409, + "balance_loss_mlp": 1.06863689, + "epoch": 0.25490573297422087, + "flos": 782266940928.0, + "grad_norm": 0.05604709920606865, + "language_loss": 0.83055937, + "learning_rate": 0.0008731249058420247, + "loss": 0.8416003, + "num_input_tokens_seen": 110048800, + "router_z_loss_mlp": 0.35498047, + "step": 1325, + "time_per_iteration": 3.0361831188201904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100095, + "balance_loss_mlp": 1.06468964, + "epoch": 0.2550981146594844, + "flos": 509610451968.0, + "grad_norm": 0.06314633870869373, + "language_loss": 0.90780556, + "learning_rate": 0.0008729174503033459, + "loss": 0.91880649, + "num_input_tokens_seen": 110118096, + "router_z_loss_mlp": 0.35424805, + "step": 1326, + "time_per_iteration": 2.639625072479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109522, + "balance_loss_mlp": 1.06007695, + "epoch": 0.255290496344748, + "flos": 676360212480.0, + "grad_norm": 0.06489195741671011, + "language_loss": 0.82650065, + "learning_rate": 0.0008727098499852728, + "loss": 0.83745289, + "num_input_tokens_seen": 110190160, + "router_z_loss_mlp": 0.35180664, + "step": 1327, + "time_per_iteration": 2.830500602722168 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109753, + "balance_loss_mlp": 1.06231546, + "epoch": 0.2554828780300115, + "flos": 537524273664.0, + "grad_norm": 0.06666455638552511, + "language_loss": 0.89945138, + "learning_rate": 0.0008725021049684034, + "loss": 0.91042662, + "num_input_tokens_seen": 110268000, + "router_z_loss_mlp": 0.35253906, + "step": 1328, + "time_per_iteration": 2.747800350189209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097741, + "balance_loss_mlp": 1.06240726, + "epoch": 0.2556752597152751, + "flos": 823828534272.0, + "grad_norm": 0.052131047599379726, + "language_loss": 0.82919741, + "learning_rate": 0.000872294215333391, + "loss": 0.84017479, + "num_input_tokens_seen": 110354816, + "router_z_loss_mlp": 0.35400391, + "step": 1329, + "time_per_iteration": 3.1658926010131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096694, + "balance_loss_mlp": 1.06176591, + "epoch": 0.2558676414005387, + "flos": 570517502976.0, + "grad_norm": 0.05425014800623288, + "language_loss": 0.82993001, + "learning_rate": 0.0008720861811609457, + "loss": 0.84089696, + "num_input_tokens_seen": 110427968, + "router_z_loss_mlp": 0.34985352, + "step": 1330, + "time_per_iteration": 2.709085702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101009, + "balance_loss_mlp": 1.06448317, + "epoch": 0.2560600230858022, + "flos": 486419475456.0, + "grad_norm": 0.05425594622111712, + "language_loss": 0.83756936, + "learning_rate": 0.0008718780025318338, + "loss": 0.84857947, + "num_input_tokens_seen": 110501184, + "router_z_loss_mlp": 0.36523438, + "step": 1331, + "time_per_iteration": 2.7126388549804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097427, + "balance_loss_mlp": 1.06280875, + "epoch": 0.2562524047710658, + "flos": 512874008064.0, + "grad_norm": 0.06594145834934585, + "language_loss": 0.8406449, + "learning_rate": 0.0008716696795268771, + "loss": 0.85161918, + "num_input_tokens_seen": 110573008, + "router_z_loss_mlp": 0.34667969, + "step": 1332, + "time_per_iteration": 2.6650350093841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089329, + "balance_loss_mlp": 1.05308938, + "epoch": 0.25644478645632934, + "flos": 634498873344.0, + "grad_norm": 0.051413439896644035, + "language_loss": 0.85076845, + "learning_rate": 0.0008714612122269538, + "loss": 0.86166173, + "num_input_tokens_seen": 110646704, + "router_z_loss_mlp": 0.36279297, + "step": 1333, + "time_per_iteration": 2.8611392974853516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109443, + "balance_loss_mlp": 1.05754697, + "epoch": 0.25663716814159293, + "flos": 436353537024.0, + "grad_norm": 0.0705935369031189, + "language_loss": 0.89120972, + "learning_rate": 0.0008712526007129982, + "loss": 0.90215403, + "num_input_tokens_seen": 110712208, + "router_z_loss_mlp": 0.36889648, + "step": 1334, + "time_per_iteration": 2.5217065811157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109442, + "balance_loss_mlp": 1.05813241, + "epoch": 0.25682954982685646, + "flos": 497892013056.0, + "grad_norm": 0.06578019441075163, + "language_loss": 0.90784955, + "learning_rate": 0.0008710438450660003, + "loss": 0.91879368, + "num_input_tokens_seen": 110783936, + "router_z_loss_mlp": 0.36303711, + "step": 1335, + "time_per_iteration": 2.651367425918579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093579, + "balance_loss_mlp": 1.05643392, + "epoch": 0.25702193151212005, + "flos": 457471176192.0, + "grad_norm": 0.07087944464696884, + "language_loss": 0.8744905, + "learning_rate": 0.0008708349453670064, + "loss": 0.88542628, + "num_input_tokens_seen": 110848560, + "router_z_loss_mlp": 0.37133789, + "step": 1336, + "time_per_iteration": 2.51411771774292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090754, + "balance_loss_mlp": 1.0543952, + "epoch": 0.2572143131973836, + "flos": 598002130944.0, + "grad_norm": 0.06329524505646734, + "language_loss": 0.91480416, + "learning_rate": 0.0008706259016971185, + "loss": 0.92571175, + "num_input_tokens_seen": 110922672, + "router_z_loss_mlp": 0.36401367, + "step": 1337, + "time_per_iteration": 2.754173517227173 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096487, + "balance_loss_mlp": 1.0589596, + "epoch": 0.25740669488264717, + "flos": 698004559872.0, + "grad_norm": 0.06697174190166053, + "language_loss": 0.83331275, + "learning_rate": 0.0008704167141374944, + "loss": 0.84427762, + "num_input_tokens_seen": 110995456, + "router_z_loss_mlp": 0.375, + "step": 1338, + "time_per_iteration": 2.795552968978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011015, + "balance_loss_mlp": 1.06385398, + "epoch": 0.25759907656791076, + "flos": 502130409984.0, + "grad_norm": 0.06639008708045263, + "language_loss": 0.88657552, + "learning_rate": 0.0008702073827693482, + "loss": 0.89759052, + "num_input_tokens_seen": 111069568, + "router_z_loss_mlp": 0.3762207, + "step": 1339, + "time_per_iteration": 2.6935572624206543 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103743, + "balance_loss_mlp": 1.065763, + "epoch": 0.2577914582531743, + "flos": 773541425664.0, + "grad_norm": 0.06917089880544881, + "language_loss": 0.88938046, + "learning_rate": 0.0008699979076739494, + "loss": 0.90041792, + "num_input_tokens_seen": 111142608, + "router_z_loss_mlp": 0.37963867, + "step": 1340, + "time_per_iteration": 2.951148509979248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102218, + "balance_loss_mlp": 1.06552505, + "epoch": 0.2579838399384379, + "flos": 459431032320.0, + "grad_norm": 0.07085954822691051, + "language_loss": 0.88831556, + "learning_rate": 0.0008697882889326234, + "loss": 0.89933777, + "num_input_tokens_seen": 111206336, + "router_z_loss_mlp": 0.36669922, + "step": 1341, + "time_per_iteration": 2.492182731628418 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105931, + "balance_loss_mlp": 1.06916654, + "epoch": 0.2581762216237014, + "flos": 568917029376.0, + "grad_norm": 0.060702491086151805, + "language_loss": 0.86630756, + "learning_rate": 0.0008695785266267515, + "loss": 0.8773669, + "num_input_tokens_seen": 111276736, + "router_z_loss_mlp": 0.36816406, + "step": 1342, + "time_per_iteration": 2.6635031700134277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111038, + "balance_loss_mlp": 1.07448828, + "epoch": 0.258368603308965, + "flos": 603906430464.0, + "grad_norm": 0.06467765584173796, + "language_loss": 0.83112109, + "learning_rate": 0.0008693686208377704, + "loss": 0.84223145, + "num_input_tokens_seen": 111353856, + "router_z_loss_mlp": 0.36547852, + "step": 1343, + "time_per_iteration": 2.7769596576690674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105659, + "balance_loss_mlp": 1.06975329, + "epoch": 0.2585609849942285, + "flos": 491204929536.0, + "grad_norm": 0.06376456739082713, + "language_loss": 0.88889539, + "learning_rate": 0.0008691585716471733, + "loss": 0.89995199, + "num_input_tokens_seen": 111424960, + "router_z_loss_mlp": 0.35913086, + "step": 1344, + "time_per_iteration": 2.6467716693878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104044, + "balance_loss_mlp": 1.06809044, + "epoch": 0.2587533666794921, + "flos": 640455607296.0, + "grad_norm": 0.057733681270749564, + "language_loss": 0.85255873, + "learning_rate": 0.0008689483791365079, + "loss": 0.86359918, + "num_input_tokens_seen": 111505248, + "router_z_loss_mlp": 0.35961914, + "step": 1345, + "time_per_iteration": 2.8041999340057373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099237, + "balance_loss_mlp": 1.06380773, + "epoch": 0.2589457483647557, + "flos": 576564397056.0, + "grad_norm": 0.05015471530609978, + "language_loss": 0.89365089, + "learning_rate": 0.0008687380433873786, + "loss": 0.9046433, + "num_input_tokens_seen": 111581936, + "router_z_loss_mlp": 0.35473633, + "step": 1346, + "time_per_iteration": 2.7955591678619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101447, + "balance_loss_mlp": 1.06630445, + "epoch": 0.25913813005001923, + "flos": 535164337152.0, + "grad_norm": 0.06074647569776127, + "language_loss": 0.82164252, + "learning_rate": 0.0008685275644814448, + "loss": 0.83265698, + "num_input_tokens_seen": 111651456, + "router_z_loss_mlp": 0.3515625, + "step": 1347, + "time_per_iteration": 2.6922154426574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100414, + "balance_loss_mlp": 1.06419861, + "epoch": 0.2593305117352828, + "flos": 720713908224.0, + "grad_norm": 0.05981927153656866, + "language_loss": 0.8445859, + "learning_rate": 0.0008683169425004216, + "loss": 0.85558999, + "num_input_tokens_seen": 111731712, + "router_z_loss_mlp": 0.36230469, + "step": 1348, + "time_per_iteration": 2.8701395988464355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.05186677, + "epoch": 0.25952289342054635, + "flos": 709782635520.0, + "grad_norm": 0.06994851779161643, + "language_loss": 0.83445579, + "learning_rate": 0.0008681061775260799, + "loss": 0.84533083, + "num_input_tokens_seen": 111800752, + "router_z_loss_mlp": 0.35644531, + "step": 1349, + "time_per_iteration": 2.8206968307495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096473, + "balance_loss_mlp": 1.06032848, + "epoch": 0.25971527510580994, + "flos": 455688820224.0, + "grad_norm": 0.06118298275127208, + "language_loss": 0.91987318, + "learning_rate": 0.0008678952696402458, + "loss": 0.93083793, + "num_input_tokens_seen": 111866752, + "router_z_loss_mlp": 0.36132812, + "step": 1350, + "time_per_iteration": 2.5547540187835693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108511, + "balance_loss_mlp": 1.04932308, + "epoch": 0.25990765679107347, + "flos": 612223100928.0, + "grad_norm": 0.04808004024566397, + "language_loss": 0.86496055, + "learning_rate": 0.000867684218924801, + "loss": 0.8758117, + "num_input_tokens_seen": 111951328, + "router_z_loss_mlp": 0.35791016, + "step": 1351, + "time_per_iteration": 2.8406949043273926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110301, + "balance_loss_mlp": 1.09857082, + "epoch": 0.26010003847633706, + "flos": 1537105766400.0, + "grad_norm": 0.059206679514604114, + "language_loss": 0.78947091, + "learning_rate": 0.0008674730254616827, + "loss": 0.80057395, + "num_input_tokens_seen": 112182272, + "router_z_loss_mlp": 0.1171875, + "step": 1352, + "time_per_iteration": 4.8775153160095215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082749, + "balance_loss_mlp": 1.04753494, + "epoch": 0.2602924201616006, + "flos": 715947393024.0, + "grad_norm": 0.046134849134736367, + "language_loss": 0.85103661, + "learning_rate": 0.0008672616893328834, + "loss": 0.86186409, + "num_input_tokens_seen": 112261760, + "router_z_loss_mlp": 0.35253906, + "step": 1353, + "time_per_iteration": 2.98063588142395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108322, + "balance_loss_mlp": 1.04764819, + "epoch": 0.2604848018468642, + "flos": 643241917440.0, + "grad_norm": 0.060512322591449175, + "language_loss": 0.9000203, + "learning_rate": 0.0008670502106204512, + "loss": 0.91085243, + "num_input_tokens_seen": 112339136, + "router_z_loss_mlp": 0.35595703, + "step": 1354, + "time_per_iteration": 2.832679271697998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087805, + "balance_loss_mlp": 1.05073047, + "epoch": 0.26067718353212777, + "flos": 516783545856.0, + "grad_norm": 0.05860289542603218, + "language_loss": 0.8165139, + "learning_rate": 0.0008668385894064892, + "loss": 0.82739192, + "num_input_tokens_seen": 112409872, + "router_z_loss_mlp": 0.37084961, + "step": 1355, + "time_per_iteration": 2.6204822063446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086317, + "balance_loss_mlp": 1.05143666, + "epoch": 0.2608695652173913, + "flos": 822361890816.0, + "grad_norm": 0.0623840657908754, + "language_loss": 0.88803548, + "learning_rate": 0.0008666268257731562, + "loss": 0.8988986, + "num_input_tokens_seen": 112495616, + "router_z_loss_mlp": 0.34912109, + "step": 1356, + "time_per_iteration": 3.113147735595703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109295, + "balance_loss_mlp": 1.0566628, + "epoch": 0.2610619469026549, + "flos": 1007451744768.0, + "grad_norm": 0.056693012024963345, + "language_loss": 0.85794425, + "learning_rate": 0.0008664149198026662, + "loss": 0.86887372, + "num_input_tokens_seen": 112575168, + "router_z_loss_mlp": 0.36279297, + "step": 1357, + "time_per_iteration": 3.2569541931152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095714, + "balance_loss_mlp": 1.06040418, + "epoch": 0.2612543285879184, + "flos": 536523291648.0, + "grad_norm": 0.061594313952015485, + "language_loss": 0.88599586, + "learning_rate": 0.0008662028715772883, + "loss": 0.89695299, + "num_input_tokens_seen": 112648480, + "router_z_loss_mlp": 0.35351562, + "step": 1358, + "time_per_iteration": 2.6102256774902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100258, + "balance_loss_mlp": 1.06475711, + "epoch": 0.261446710273182, + "flos": 519166803456.0, + "grad_norm": 0.04975036534081278, + "language_loss": 0.85662109, + "learning_rate": 0.0008659906811793467, + "loss": 0.86762363, + "num_input_tokens_seen": 112719856, + "router_z_loss_mlp": 0.35546875, + "step": 1359, + "time_per_iteration": 2.6921935081481934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101082, + "balance_loss_mlp": 1.06543839, + "epoch": 0.26163909195844554, + "flos": 582975055872.0, + "grad_norm": 0.06646109128582675, + "language_loss": 0.89397144, + "learning_rate": 0.0008657783486912215, + "loss": 0.90498233, + "num_input_tokens_seen": 112795088, + "router_z_loss_mlp": 0.35693359, + "step": 1360, + "time_per_iteration": 2.7003283500671387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110494, + "balance_loss_mlp": 1.06920147, + "epoch": 0.2618314736437091, + "flos": 958362057216.0, + "grad_norm": 0.06344844215605515, + "language_loss": 0.89840877, + "learning_rate": 0.0008655658741953472, + "loss": 0.90945816, + "num_input_tokens_seen": 112879888, + "router_z_loss_mlp": 0.35742188, + "step": 1361, + "time_per_iteration": 3.207960844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101357, + "balance_loss_mlp": 1.0664053, + "epoch": 0.26202385532897265, + "flos": 574530347520.0, + "grad_norm": 0.04606923720206454, + "language_loss": 0.88105857, + "learning_rate": 0.0008653532577742136, + "loss": 0.89207214, + "num_input_tokens_seen": 112952208, + "router_z_loss_mlp": 0.34960938, + "step": 1362, + "time_per_iteration": 2.69209885597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091565, + "balance_loss_mlp": 1.05744767, + "epoch": 0.26221623701423624, + "flos": 445240585728.0, + "grad_norm": 0.05480512848555835, + "language_loss": 0.87200153, + "learning_rate": 0.0008651404995103659, + "loss": 0.88291717, + "num_input_tokens_seen": 113017472, + "router_z_loss_mlp": 0.34155273, + "step": 1363, + "time_per_iteration": 2.5325255393981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095927, + "balance_loss_mlp": 1.06164205, + "epoch": 0.26240861869949983, + "flos": 535459700736.0, + "grad_norm": 0.04992660146640532, + "language_loss": 0.870713, + "learning_rate": 0.0008649275994864041, + "loss": 0.8816722, + "num_input_tokens_seen": 113090000, + "router_z_loss_mlp": 0.34301758, + "step": 1364, + "time_per_iteration": 2.682365894317627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098846, + "balance_loss_mlp": 1.06267846, + "epoch": 0.26260100038476336, + "flos": 564940500480.0, + "grad_norm": 0.05369640644722127, + "language_loss": 0.83917898, + "learning_rate": 0.0008647145577849834, + "loss": 0.85016745, + "num_input_tokens_seen": 113169424, + "router_z_loss_mlp": 0.36157227, + "step": 1365, + "time_per_iteration": 2.8129918575286865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102304, + "balance_loss_mlp": 1.06701851, + "epoch": 0.26279338207002695, + "flos": 612745426944.0, + "grad_norm": 0.045782565775991005, + "language_loss": 0.82809973, + "learning_rate": 0.0008645013744888139, + "loss": 0.83912277, + "num_input_tokens_seen": 113256752, + "router_z_loss_mlp": 0.35327148, + "step": 1366, + "time_per_iteration": 2.8523411750793457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101615, + "balance_loss_mlp": 1.06730664, + "epoch": 0.2629857637552905, + "flos": 522555425280.0, + "grad_norm": 0.0597350257589219, + "language_loss": 0.87579656, + "learning_rate": 0.0008642880496806607, + "loss": 0.88681269, + "num_input_tokens_seen": 113330512, + "router_z_loss_mlp": 0.34350586, + "step": 1367, + "time_per_iteration": 2.766350507736206 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105811, + "balance_loss_mlp": 1.0706687, + "epoch": 0.26317814544055407, + "flos": 534273864192.0, + "grad_norm": 0.05812227598952832, + "language_loss": 0.84219468, + "learning_rate": 0.0008640745834433437, + "loss": 0.85325277, + "num_input_tokens_seen": 113409088, + "router_z_loss_mlp": 0.35205078, + "step": 1368, + "time_per_iteration": 2.7220964431762695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100459, + "balance_loss_mlp": 1.06553102, + "epoch": 0.2633705271258176, + "flos": 555235762176.0, + "grad_norm": 0.06954601812969684, + "language_loss": 0.86862296, + "learning_rate": 0.000863860975859738, + "loss": 0.87962759, + "num_input_tokens_seen": 113486624, + "router_z_loss_mlp": 0.34960938, + "step": 1369, + "time_per_iteration": 2.8985280990600586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05931866, + "epoch": 0.2635629088110812, + "flos": 552136711680.0, + "grad_norm": 0.06493737783890446, + "language_loss": 0.88711715, + "learning_rate": 0.0008636472270127733, + "loss": 0.89805913, + "num_input_tokens_seen": 113555776, + "router_z_loss_mlp": 0.34936523, + "step": 1370, + "time_per_iteration": 2.634615182876587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090389, + "balance_loss_mlp": 1.05498338, + "epoch": 0.2637552904963448, + "flos": 455752839168.0, + "grad_norm": 0.062476231294863314, + "language_loss": 0.89913595, + "learning_rate": 0.0008634333369854345, + "loss": 0.91003978, + "num_input_tokens_seen": 113624208, + "router_z_loss_mlp": 0.35424805, + "step": 1371, + "time_per_iteration": 2.5908331871032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082759, + "balance_loss_mlp": 1.04818797, + "epoch": 0.2639476721816083, + "flos": 612847323648.0, + "grad_norm": 0.05509554660574217, + "language_loss": 0.87495965, + "learning_rate": 0.0008632193058607608, + "loss": 0.88578725, + "num_input_tokens_seen": 113698544, + "router_z_loss_mlp": 0.34594727, + "step": 1372, + "time_per_iteration": 2.6963188648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108436, + "balance_loss_mlp": 1.04878759, + "epoch": 0.2641400538668719, + "flos": 571645112832.0, + "grad_norm": 0.05982264925210271, + "language_loss": 0.81028771, + "learning_rate": 0.0008630051337218466, + "loss": 0.82113135, + "num_input_tokens_seen": 113769024, + "router_z_loss_mlp": 0.35595703, + "step": 1373, + "time_per_iteration": 2.644540786743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079985, + "balance_loss_mlp": 1.04582, + "epoch": 0.2643324355521354, + "flos": 581979866112.0, + "grad_norm": 0.08561984623812412, + "language_loss": 0.82428128, + "learning_rate": 0.0008627908206518409, + "loss": 0.8350811, + "num_input_tokens_seen": 113836320, + "router_z_loss_mlp": 0.34179688, + "step": 1374, + "time_per_iteration": 2.660578966140747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110354, + "balance_loss_mlp": 1.08904421, + "epoch": 0.264524817237399, + "flos": 1543845284352.0, + "grad_norm": 0.03725698642258328, + "language_loss": 0.75151253, + "learning_rate": 0.0008625763667339472, + "loss": 0.76254791, + "num_input_tokens_seen": 114065040, + "router_z_loss_mlp": 0.14453125, + "step": 1375, + "time_per_iteration": 4.9595324993133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079461, + "balance_loss_mlp": 1.04474711, + "epoch": 0.26471719892266254, + "flos": 517783117824.0, + "grad_norm": 0.05493851972821551, + "language_loss": 0.91330564, + "learning_rate": 0.0008623617720514241, + "loss": 0.92410028, + "num_input_tokens_seen": 114133488, + "router_z_loss_mlp": 0.34741211, + "step": 1376, + "time_per_iteration": 2.5929205417633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090244, + "balance_loss_mlp": 1.05498242, + "epoch": 0.26490958060792613, + "flos": 516936314880.0, + "grad_norm": 0.08106601153347975, + "language_loss": 0.84946424, + "learning_rate": 0.0008621470366875848, + "loss": 0.8603667, + "num_input_tokens_seen": 114200704, + "router_z_loss_mlp": 0.3527832, + "step": 1377, + "time_per_iteration": 2.5729684829711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084439, + "balance_loss_mlp": 1.0497725, + "epoch": 0.26510196229318966, + "flos": 596298350592.0, + "grad_norm": 0.05588669268878349, + "language_loss": 0.87771004, + "learning_rate": 0.0008619321607257966, + "loss": 0.88855445, + "num_input_tokens_seen": 114272160, + "router_z_loss_mlp": 0.34692383, + "step": 1378, + "time_per_iteration": 2.6708004474639893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082921, + "balance_loss_mlp": 1.0483501, + "epoch": 0.26529434397845325, + "flos": 685488780288.0, + "grad_norm": 0.051774701706919043, + "language_loss": 0.82311988, + "learning_rate": 0.000861717144249482, + "loss": 0.83394915, + "num_input_tokens_seen": 114347904, + "router_z_loss_mlp": 0.34594727, + "step": 1379, + "time_per_iteration": 2.8249831199645996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081979, + "balance_loss_mlp": 1.0468595, + "epoch": 0.26548672566371684, + "flos": 424127328768.0, + "grad_norm": 0.06288210815556809, + "language_loss": 0.90205348, + "learning_rate": 0.0008615019873421175, + "loss": 0.91287327, + "num_input_tokens_seen": 114409952, + "router_z_loss_mlp": 0.35131836, + "step": 1380, + "time_per_iteration": 2.455320358276367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108606, + "balance_loss_mlp": 1.05108428, + "epoch": 0.26567910734898037, + "flos": 489619012608.0, + "grad_norm": 0.05393715583789803, + "language_loss": 0.8609767, + "learning_rate": 0.0008612866900872349, + "loss": 0.87183726, + "num_input_tokens_seen": 114474832, + "router_z_loss_mlp": 0.35009766, + "step": 1381, + "time_per_iteration": 2.54070782661438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083002, + "balance_loss_mlp": 1.04895568, + "epoch": 0.26587148903424396, + "flos": 533947977216.0, + "grad_norm": 0.05290962754614328, + "language_loss": 0.88052452, + "learning_rate": 0.0008610712525684197, + "loss": 0.8913545, + "num_input_tokens_seen": 114545152, + "router_z_loss_mlp": 0.34082031, + "step": 1382, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084878, + "balance_loss_mlp": 1.05049801, + "epoch": 0.2660638707195075, + "flos": 1017067732992.0, + "grad_norm": 0.06267977315545337, + "language_loss": 0.84534729, + "learning_rate": 0.0008608556748693121, + "loss": 0.85619605, + "num_input_tokens_seen": 114626512, + "router_z_loss_mlp": 0.34423828, + "step": 1383, + "time_per_iteration": 3.231172561645508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086499, + "balance_loss_mlp": 1.05216646, + "epoch": 0.2662562524047711, + "flos": 523712148480.0, + "grad_norm": 0.0585640776606728, + "language_loss": 0.86247015, + "learning_rate": 0.000860639957073607, + "loss": 0.87333512, + "num_input_tokens_seen": 114701008, + "router_z_loss_mlp": 0.34375, + "step": 1384, + "time_per_iteration": 2.72265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05280018, + "epoch": 0.2664486340900346, + "flos": 552107598336.0, + "grad_norm": 0.07312693577598182, + "language_loss": 0.87888551, + "learning_rate": 0.0008604240992650534, + "loss": 0.88975734, + "num_input_tokens_seen": 114771984, + "router_z_loss_mlp": 0.34423828, + "step": 1385, + "time_per_iteration": 2.6524593830108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079729, + "balance_loss_mlp": 1.0455637, + "epoch": 0.2666410157752982, + "flos": 469895233536.0, + "grad_norm": 0.058731941016447735, + "language_loss": 0.89070451, + "learning_rate": 0.0008602081015274545, + "loss": 0.90150183, + "num_input_tokens_seen": 114844800, + "router_z_loss_mlp": 0.34179688, + "step": 1386, + "time_per_iteration": 2.7026963233947754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083917, + "balance_loss_mlp": 1.04953694, + "epoch": 0.2668333974605617, + "flos": 569645968896.0, + "grad_norm": 0.04572049987167494, + "language_loss": 0.83031899, + "learning_rate": 0.0008599919639446684, + "loss": 0.84115815, + "num_input_tokens_seen": 114918544, + "router_z_loss_mlp": 0.34423828, + "step": 1387, + "time_per_iteration": 2.6891515254974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083945, + "balance_loss_mlp": 1.04920745, + "epoch": 0.2670257791458253, + "flos": 398755325952.0, + "grad_norm": 0.06113709644372323, + "language_loss": 0.80263156, + "learning_rate": 0.000859775686600607, + "loss": 0.81347102, + "num_input_tokens_seen": 114984272, + "router_z_loss_mlp": 0.34765625, + "step": 1388, + "time_per_iteration": 2.5367043018341064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088195, + "balance_loss_mlp": 1.05400586, + "epoch": 0.2672181608310889, + "flos": 515587534848.0, + "grad_norm": 0.07715457421599592, + "language_loss": 0.85045016, + "learning_rate": 0.0008595592695792367, + "loss": 0.86133218, + "num_input_tokens_seen": 115054800, + "router_z_loss_mlp": 0.34228516, + "step": 1389, + "time_per_iteration": 2.653684139251709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097788, + "balance_loss_mlp": 1.06348014, + "epoch": 0.26741054251635243, + "flos": 507270864384.0, + "grad_norm": 0.05276083290405683, + "language_loss": 0.9085412, + "learning_rate": 0.0008593427129645778, + "loss": 0.91951907, + "num_input_tokens_seen": 115120928, + "router_z_loss_mlp": 0.34350586, + "step": 1390, + "time_per_iteration": 2.5497426986694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100169, + "balance_loss_mlp": 1.06512117, + "epoch": 0.267602924201616, + "flos": 576357783552.0, + "grad_norm": 0.0907689109524766, + "language_loss": 0.85371816, + "learning_rate": 0.0008591260168407052, + "loss": 0.86471987, + "num_input_tokens_seen": 115196688, + "router_z_loss_mlp": 0.35083008, + "step": 1391, + "time_per_iteration": 2.752777576446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099505, + "balance_loss_mlp": 1.06598353, + "epoch": 0.26779530588687955, + "flos": 523731087360.0, + "grad_norm": 0.05201595058269412, + "language_loss": 0.83216429, + "learning_rate": 0.0008589091812917479, + "loss": 0.84315932, + "num_input_tokens_seen": 115264912, + "router_z_loss_mlp": 0.33544922, + "step": 1392, + "time_per_iteration": 2.602858781814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092875, + "balance_loss_mlp": 1.05897164, + "epoch": 0.26798768757214314, + "flos": 556508938752.0, + "grad_norm": 0.054199587407170555, + "language_loss": 0.85476619, + "learning_rate": 0.0008586922064018887, + "loss": 0.86569488, + "num_input_tokens_seen": 115334672, + "router_z_loss_mlp": 0.33911133, + "step": 1393, + "time_per_iteration": 2.663135528564453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110003, + "balance_loss_mlp": 1.0641005, + "epoch": 0.2681800692574067, + "flos": 930246004224.0, + "grad_norm": 0.05606615550920643, + "language_loss": 0.89228028, + "learning_rate": 0.0008584750922553651, + "loss": 0.90328062, + "num_input_tokens_seen": 115420032, + "router_z_loss_mlp": 0.35961914, + "step": 1394, + "time_per_iteration": 3.126030206680298 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094414, + "balance_loss_mlp": 1.06020141, + "epoch": 0.26837245094267026, + "flos": 700771931136.0, + "grad_norm": 0.055333821001128054, + "language_loss": 0.83724457, + "learning_rate": 0.0008582578389364677, + "loss": 0.84818876, + "num_input_tokens_seen": 115492576, + "router_z_loss_mlp": 0.3425293, + "step": 1395, + "time_per_iteration": 2.858774423599243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06187642, + "epoch": 0.26856483262793385, + "flos": 592892199936.0, + "grad_norm": 0.04773968262798697, + "language_loss": 0.9195987, + "learning_rate": 0.0008580404465295422, + "loss": 0.93056792, + "num_input_tokens_seen": 115568368, + "router_z_loss_mlp": 0.35058594, + "step": 1396, + "time_per_iteration": 2.7737646102905273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096258, + "balance_loss_mlp": 1.06190252, + "epoch": 0.2687572143131974, + "flos": 713943866880.0, + "grad_norm": 0.07288281155022573, + "language_loss": 0.88208908, + "learning_rate": 0.0008578229151189876, + "loss": 0.89305162, + "num_input_tokens_seen": 115651536, + "router_z_loss_mlp": 0.34375, + "step": 1397, + "time_per_iteration": 2.9974029064178467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087674, + "balance_loss_mlp": 1.05441451, + "epoch": 0.26894959599846097, + "flos": 467481452544.0, + "grad_norm": 0.0581153622766974, + "language_loss": 0.81433654, + "learning_rate": 0.0008576052447892573, + "loss": 0.82521319, + "num_input_tokens_seen": 115715696, + "router_z_loss_mlp": 0.33276367, + "step": 1398, + "time_per_iteration": 2.586427688598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090085, + "balance_loss_mlp": 1.05551457, + "epoch": 0.2691419776837245, + "flos": 468470850048.0, + "grad_norm": 0.08083264737918114, + "language_loss": 0.86589479, + "learning_rate": 0.000857387435624858, + "loss": 0.87679559, + "num_input_tokens_seen": 115780928, + "router_z_loss_mlp": 0.34619141, + "step": 1399, + "time_per_iteration": 2.5227789878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096966, + "balance_loss_mlp": 1.06239533, + "epoch": 0.2693343593689881, + "flos": 937244418048.0, + "grad_norm": 0.0443934808912178, + "language_loss": 0.88525635, + "learning_rate": 0.0008571694877103513, + "loss": 0.89622605, + "num_input_tokens_seen": 115874432, + "router_z_loss_mlp": 0.34594727, + "step": 1400, + "time_per_iteration": 3.252573251724243 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095335, + "balance_loss_mlp": 1.06064546, + "epoch": 0.2695267410542516, + "flos": 577303511040.0, + "grad_norm": 0.05297583192015558, + "language_loss": 0.87603962, + "learning_rate": 0.0008569514011303515, + "loss": 0.88699305, + "num_input_tokens_seen": 115956608, + "router_z_loss_mlp": 0.34716797, + "step": 1401, + "time_per_iteration": 2.7824223041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.0577879, + "epoch": 0.2697191227395152, + "flos": 556539462144.0, + "grad_norm": 0.06414718170709632, + "language_loss": 0.87859815, + "learning_rate": 0.0008567331759695277, + "loss": 0.88952529, + "num_input_tokens_seen": 116031728, + "router_z_loss_mlp": 0.34985352, + "step": 1402, + "time_per_iteration": 2.7109498977661133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098142, + "balance_loss_mlp": 1.06183052, + "epoch": 0.26991150442477874, + "flos": 529024310784.0, + "grad_norm": 0.05462837975359106, + "language_loss": 0.86148876, + "learning_rate": 0.0008565148123126023, + "loss": 0.87247014, + "num_input_tokens_seen": 116104288, + "router_z_loss_mlp": 0.36328125, + "step": 1403, + "time_per_iteration": 2.6526310443878174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088301, + "balance_loss_mlp": 1.05289555, + "epoch": 0.2701038861100423, + "flos": 531737837568.0, + "grad_norm": 0.12276519374226595, + "language_loss": 0.86177158, + "learning_rate": 0.0008562963102443516, + "loss": 0.87265456, + "num_input_tokens_seen": 116177920, + "router_z_loss_mlp": 0.35400391, + "step": 1404, + "time_per_iteration": 2.6809849739074707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092814, + "balance_loss_mlp": 1.05757618, + "epoch": 0.2702962677953059, + "flos": 734908737024.0, + "grad_norm": 0.05743337882617235, + "language_loss": 0.85265231, + "learning_rate": 0.0008560776698496056, + "loss": 0.86358047, + "num_input_tokens_seen": 116251680, + "router_z_loss_mlp": 0.3527832, + "step": 1405, + "time_per_iteration": 2.9008774757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099947, + "balance_loss_mlp": 1.06420779, + "epoch": 0.27048864948056944, + "flos": 574453181952.0, + "grad_norm": 0.06281004106283315, + "language_loss": 0.85864103, + "learning_rate": 0.0008558588912132481, + "loss": 0.86964047, + "num_input_tokens_seen": 116327664, + "router_z_loss_mlp": 0.35742188, + "step": 1406, + "time_per_iteration": 2.8967840671539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01057025, + "balance_loss_mlp": 1.04519963, + "epoch": 0.27068103116583303, + "flos": 1423091953152.0, + "grad_norm": 0.03126478371356873, + "language_loss": 0.76458991, + "learning_rate": 0.0008556399744202163, + "loss": 0.77516007, + "num_input_tokens_seen": 116555152, + "router_z_loss_mlp": 0.11816406, + "step": 1407, + "time_per_iteration": 4.933698892593384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109874, + "balance_loss_mlp": 1.06400251, + "epoch": 0.27087341285109656, + "flos": 531742219776.0, + "grad_norm": 0.050597666424933845, + "language_loss": 0.82942683, + "learning_rate": 0.0008554209195555016, + "loss": 0.84041423, + "num_input_tokens_seen": 116626016, + "router_z_loss_mlp": 0.34765625, + "step": 1408, + "time_per_iteration": 2.6599888801574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093614, + "balance_loss_mlp": 1.0582329, + "epoch": 0.27106579453636015, + "flos": 581108332032.0, + "grad_norm": 0.058412744436649705, + "language_loss": 0.88199335, + "learning_rate": 0.0008552017267041483, + "loss": 0.89292949, + "num_input_tokens_seen": 116699152, + "router_z_loss_mlp": 0.35375977, + "step": 1409, + "time_per_iteration": 2.673828363418579 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091257, + "balance_loss_mlp": 1.05563736, + "epoch": 0.2712581762216237, + "flos": 506533160448.0, + "grad_norm": 0.05246666988179206, + "language_loss": 0.83264577, + "learning_rate": 0.0008549823959512549, + "loss": 0.84355831, + "num_input_tokens_seen": 116770912, + "router_z_loss_mlp": 0.35644531, + "step": 1410, + "time_per_iteration": 2.634523868560791 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091517, + "balance_loss_mlp": 1.05451441, + "epoch": 0.27145055790688727, + "flos": 997019476992.0, + "grad_norm": 0.050905982394943275, + "language_loss": 0.86668658, + "learning_rate": 0.0008547629273819728, + "loss": 0.87760168, + "num_input_tokens_seen": 116863088, + "router_z_loss_mlp": 0.36987305, + "step": 1411, + "time_per_iteration": 3.3559322357177734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094566, + "balance_loss_mlp": 1.05875564, + "epoch": 0.2716429395921508, + "flos": 546420086784.0, + "grad_norm": 0.06363965087638479, + "language_loss": 0.83773881, + "learning_rate": 0.0008545433210815074, + "loss": 0.84868449, + "num_input_tokens_seen": 116929504, + "router_z_loss_mlp": 0.35839844, + "step": 1412, + "time_per_iteration": 2.607379913330078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109226, + "balance_loss_mlp": 1.05606771, + "epoch": 0.2718353212774144, + "flos": 572954605056.0, + "grad_norm": 0.05881941163427475, + "language_loss": 0.87753916, + "learning_rate": 0.0008543235771351176, + "loss": 0.88846171, + "num_input_tokens_seen": 117004064, + "router_z_loss_mlp": 0.36230469, + "step": 1413, + "time_per_iteration": 2.722318649291992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096323, + "balance_loss_mlp": 1.06118035, + "epoch": 0.272027702962678, + "flos": 643986823680.0, + "grad_norm": 0.044269909609048815, + "language_loss": 0.84649068, + "learning_rate": 0.0008541036956281154, + "loss": 0.85745388, + "num_input_tokens_seen": 117081328, + "router_z_loss_mlp": 0.3515625, + "step": 1414, + "time_per_iteration": 2.8785104751586914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100326, + "balance_loss_mlp": 1.0645628, + "epoch": 0.2722200846479415, + "flos": 653410755072.0, + "grad_norm": 0.0658433573318433, + "language_loss": 0.82281864, + "learning_rate": 0.0008538836766458665, + "loss": 0.83382189, + "num_input_tokens_seen": 117156544, + "router_z_loss_mlp": 0.35791016, + "step": 1415, + "time_per_iteration": 2.834148645401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098566, + "balance_loss_mlp": 1.06311321, + "epoch": 0.2724124663332051, + "flos": 579346324992.0, + "grad_norm": 0.07330345680392343, + "language_loss": 0.85275221, + "learning_rate": 0.0008536635202737897, + "loss": 0.86373788, + "num_input_tokens_seen": 117230208, + "router_z_loss_mlp": 0.35473633, + "step": 1416, + "time_per_iteration": 2.7886626720428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099993, + "balance_loss_mlp": 1.06513667, + "epoch": 0.2726048480184686, + "flos": 537178037760.0, + "grad_norm": 0.06667202152625065, + "language_loss": 0.82212454, + "learning_rate": 0.0008534432265973573, + "loss": 0.8331244, + "num_input_tokens_seen": 117298080, + "router_z_loss_mlp": 0.34912109, + "step": 1417, + "time_per_iteration": 2.604626417160034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095149, + "balance_loss_mlp": 1.05931497, + "epoch": 0.2727972297037322, + "flos": 995360776704.0, + "grad_norm": 0.08172035912068172, + "language_loss": 0.88052338, + "learning_rate": 0.000853222795702095, + "loss": 0.8914749, + "num_input_tokens_seen": 117396256, + "router_z_loss_mlp": 0.35839844, + "step": 1418, + "time_per_iteration": 3.391664505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095275, + "balance_loss_mlp": 1.06125307, + "epoch": 0.27298961138899575, + "flos": 605924513280.0, + "grad_norm": 0.05480231780963067, + "language_loss": 0.83608377, + "learning_rate": 0.0008530022276735813, + "loss": 0.84703648, + "num_input_tokens_seen": 117467936, + "router_z_loss_mlp": 0.34057617, + "step": 1419, + "time_per_iteration": 2.705235004425049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107985, + "balance_loss_mlp": 1.04666257, + "epoch": 0.27318199307425933, + "flos": 529059216384.0, + "grad_norm": 0.054542785174291425, + "language_loss": 0.85957551, + "learning_rate": 0.0008527815225974489, + "loss": 0.87037402, + "num_input_tokens_seen": 117538256, + "router_z_loss_mlp": 0.33203125, + "step": 1420, + "time_per_iteration": 2.654003620147705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087931, + "balance_loss_mlp": 1.05395651, + "epoch": 0.2733743747595229, + "flos": 408809272320.0, + "grad_norm": 0.06460584893454492, + "language_loss": 0.88538897, + "learning_rate": 0.0008525606805593829, + "loss": 0.89626825, + "num_input_tokens_seen": 117599488, + "router_z_loss_mlp": 0.33984375, + "step": 1421, + "time_per_iteration": 2.4287912845611572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087097, + "balance_loss_mlp": 1.05233574, + "epoch": 0.27356675644478645, + "flos": 515976030720.0, + "grad_norm": 0.055753761808712644, + "language_loss": 0.82379127, + "learning_rate": 0.0008523397016451213, + "loss": 0.8346622, + "num_input_tokens_seen": 117664240, + "router_z_loss_mlp": 0.34814453, + "step": 1422, + "time_per_iteration": 2.5620808601379395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096714, + "balance_loss_mlp": 1.0617379, + "epoch": 0.27375913813005004, + "flos": 1051914539520.0, + "grad_norm": 0.0481984630724129, + "language_loss": 0.87272507, + "learning_rate": 0.0008521185859404564, + "loss": 0.88369215, + "num_input_tokens_seen": 117754768, + "router_z_loss_mlp": 0.34985352, + "step": 1423, + "time_per_iteration": 3.361171245574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085067, + "balance_loss_mlp": 1.05037737, + "epoch": 0.27395151981531357, + "flos": 624507535872.0, + "grad_norm": 0.05502068897485729, + "language_loss": 0.89717311, + "learning_rate": 0.0008518973335312326, + "loss": 0.90802383, + "num_input_tokens_seen": 117832816, + "router_z_loss_mlp": 0.34716797, + "step": 1424, + "time_per_iteration": 2.7964961528778076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088932, + "balance_loss_mlp": 1.05390823, + "epoch": 0.27414390150057716, + "flos": 550112836608.0, + "grad_norm": 0.056708357312241935, + "language_loss": 0.83878243, + "learning_rate": 0.0008516759445033477, + "loss": 0.84967172, + "num_input_tokens_seen": 117899168, + "router_z_loss_mlp": 0.35058594, + "step": 1425, + "time_per_iteration": 2.6100170612335205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090578, + "balance_loss_mlp": 1.05603087, + "epoch": 0.2743362831858407, + "flos": 539596200960.0, + "grad_norm": 0.061048707375716146, + "language_loss": 0.84983361, + "learning_rate": 0.0008514544189427526, + "loss": 0.86073935, + "num_input_tokens_seen": 117972384, + "router_z_loss_mlp": 0.34594727, + "step": 1426, + "time_per_iteration": 2.6465015411376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088002, + "balance_loss_mlp": 1.05347919, + "epoch": 0.2745286648711043, + "flos": 468352986624.0, + "grad_norm": 0.061383055639382046, + "language_loss": 0.8704657, + "learning_rate": 0.0008512327569354511, + "loss": 0.88134569, + "num_input_tokens_seen": 118039584, + "router_z_loss_mlp": 0.34570312, + "step": 1427, + "time_per_iteration": 2.5696229934692383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087297, + "balance_loss_mlp": 1.05212998, + "epoch": 0.2747210465563678, + "flos": 472617524736.0, + "grad_norm": 0.05941983459852813, + "language_loss": 0.8349936, + "learning_rate": 0.0008510109585675001, + "loss": 0.84586656, + "num_input_tokens_seen": 118108352, + "router_z_loss_mlp": 0.35180664, + "step": 1428, + "time_per_iteration": 2.6195123195648193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086947, + "balance_loss_mlp": 1.07245111, + "epoch": 0.2749134282416314, + "flos": 1314345070080.0, + "grad_norm": 0.037284634304165044, + "language_loss": 0.81153345, + "learning_rate": 0.0008507890239250093, + "loss": 0.82240289, + "num_input_tokens_seen": 118331120, + "router_z_loss_mlp": 0.14453125, + "step": 1429, + "time_per_iteration": 4.714681625366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083808, + "balance_loss_mlp": 1.04938078, + "epoch": 0.275105809926895, + "flos": 970445670912.0, + "grad_norm": 0.07686972857934649, + "language_loss": 0.80942416, + "learning_rate": 0.0008505669530941415, + "loss": 0.82026225, + "num_input_tokens_seen": 118415872, + "router_z_loss_mlp": 0.34472656, + "step": 1430, + "time_per_iteration": 3.2975006103515625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080869, + "balance_loss_mlp": 1.04715657, + "epoch": 0.2752981916121585, + "flos": 527089185792.0, + "grad_norm": 0.061626933195079385, + "language_loss": 0.84357536, + "learning_rate": 0.000850344746161112, + "loss": 0.85438406, + "num_input_tokens_seen": 118483008, + "router_z_loss_mlp": 0.33740234, + "step": 1431, + "time_per_iteration": 2.596623182296753 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079512, + "balance_loss_mlp": 1.04487014, + "epoch": 0.2754905732974221, + "flos": 453487444992.0, + "grad_norm": 0.05883177646218185, + "language_loss": 0.87880194, + "learning_rate": 0.0008501224032121894, + "loss": 0.88959706, + "num_input_tokens_seen": 118545840, + "router_z_loss_mlp": 0.34692383, + "step": 1432, + "time_per_iteration": 2.5134201049804688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_mlp": 1.04817569, + "epoch": 0.27568295498268564, + "flos": 497216918016.0, + "grad_norm": 0.05235854639463291, + "language_loss": 0.82002538, + "learning_rate": 0.0008498999243336946, + "loss": 0.83085239, + "num_input_tokens_seen": 118615168, + "router_z_loss_mlp": 0.34570312, + "step": 1433, + "time_per_iteration": 2.601771593093872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095225, + "balance_loss_mlp": 1.06060696, + "epoch": 0.2758753366679492, + "flos": 607890161664.0, + "grad_norm": 0.05891633941102979, + "language_loss": 0.87444806, + "learning_rate": 0.0008496773096120021, + "loss": 0.8854003, + "num_input_tokens_seen": 118690384, + "router_z_loss_mlp": 0.34643555, + "step": 1434, + "time_per_iteration": 2.788516044616699 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096763, + "balance_loss_mlp": 1.06212115, + "epoch": 0.27606771835321275, + "flos": 739803290112.0, + "grad_norm": 0.06770297286276174, + "language_loss": 0.84185004, + "learning_rate": 0.0008494545591335381, + "loss": 0.85281765, + "num_input_tokens_seen": 118763024, + "router_z_loss_mlp": 0.34667969, + "step": 1435, + "time_per_iteration": 2.8759751319885254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110127, + "balance_loss_mlp": 1.06736696, + "epoch": 0.27626010003847634, + "flos": 554279860224.0, + "grad_norm": 0.04450223786838935, + "language_loss": 0.87244952, + "learning_rate": 0.0008492316729847823, + "loss": 0.88346225, + "num_input_tokens_seen": 118845536, + "router_z_loss_mlp": 0.33935547, + "step": 1436, + "time_per_iteration": 2.781244993209839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094118, + "balance_loss_mlp": 1.06023908, + "epoch": 0.2764524817237399, + "flos": 542270439936.0, + "grad_norm": 0.055325808882979444, + "language_loss": 0.79874223, + "learning_rate": 0.0008490086512522664, + "loss": 0.80968338, + "num_input_tokens_seen": 118919008, + "router_z_loss_mlp": 0.33911133, + "step": 1437, + "time_per_iteration": 2.7197165489196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093138, + "balance_loss_mlp": 1.05913925, + "epoch": 0.27664486340900346, + "flos": 406027344384.0, + "grad_norm": 0.0539948754920925, + "language_loss": 0.90713239, + "learning_rate": 0.0008487854940225755, + "loss": 0.91806382, + "num_input_tokens_seen": 118981376, + "router_z_loss_mlp": 0.34008789, + "step": 1438, + "time_per_iteration": 2.438218593597412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094602, + "balance_loss_mlp": 1.06017423, + "epoch": 0.27683724509426705, + "flos": 521884712448.0, + "grad_norm": 0.06140365718889793, + "language_loss": 0.90140885, + "learning_rate": 0.0008485622013823466, + "loss": 0.91235483, + "num_input_tokens_seen": 119050560, + "router_z_loss_mlp": 0.34448242, + "step": 1439, + "time_per_iteration": 2.653393268585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102393, + "balance_loss_mlp": 1.06770289, + "epoch": 0.2770296267795306, + "flos": 535085761536.0, + "grad_norm": 0.07554461134761571, + "language_loss": 0.8283006, + "learning_rate": 0.00084833877341827, + "loss": 0.83932453, + "num_input_tokens_seen": 119121104, + "router_z_loss_mlp": 0.34692383, + "step": 1440, + "time_per_iteration": 2.6312928199768066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109947, + "balance_loss_mlp": 1.06587648, + "epoch": 0.27722200846479417, + "flos": 487747906560.0, + "grad_norm": 0.12145939933268801, + "language_loss": 0.8064183, + "learning_rate": 0.000848115210217088, + "loss": 0.81741297, + "num_input_tokens_seen": 119187712, + "router_z_loss_mlp": 0.33618164, + "step": 1441, + "time_per_iteration": 2.5490710735321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086774, + "balance_loss_mlp": 1.05167925, + "epoch": 0.2774143901500577, + "flos": 618012509184.0, + "grad_norm": 0.05766268366580332, + "language_loss": 0.82057106, + "learning_rate": 0.0008478915118655952, + "loss": 0.83143878, + "num_input_tokens_seen": 119259264, + "router_z_loss_mlp": 0.35131836, + "step": 1442, + "time_per_iteration": 2.710240602493286 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086605, + "balance_loss_mlp": 1.05160558, + "epoch": 0.2776067718353213, + "flos": 513563659776.0, + "grad_norm": 0.05774564569051742, + "language_loss": 0.86505657, + "learning_rate": 0.0008476676784506393, + "loss": 0.87592262, + "num_input_tokens_seen": 119328304, + "router_z_loss_mlp": 0.35009766, + "step": 1443, + "time_per_iteration": 2.636622667312622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108686, + "balance_loss_mlp": 1.0526228, + "epoch": 0.2777991535205848, + "flos": 1003985957376.0, + "grad_norm": 0.10311001825576924, + "language_loss": 0.82024419, + "learning_rate": 0.0008474437100591201, + "loss": 0.8311128, + "num_input_tokens_seen": 119412352, + "router_z_loss_mlp": 0.34277344, + "step": 1444, + "time_per_iteration": 3.282383918762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091274, + "balance_loss_mlp": 1.05517697, + "epoch": 0.2779915352058484, + "flos": 550005147648.0, + "grad_norm": 0.05151300271624721, + "language_loss": 0.85496646, + "learning_rate": 0.0008472196067779898, + "loss": 0.86587918, + "num_input_tokens_seen": 119484464, + "router_z_loss_mlp": 0.36108398, + "step": 1445, + "time_per_iteration": 2.703263998031616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092087, + "balance_loss_mlp": 1.05591917, + "epoch": 0.278183916891112, + "flos": 873444930048.0, + "grad_norm": 0.06736388569990436, + "language_loss": 0.85432607, + "learning_rate": 0.0008469953686942531, + "loss": 0.86524689, + "num_input_tokens_seen": 119557280, + "router_z_loss_mlp": 0.36181641, + "step": 1446, + "time_per_iteration": 3.0834743976593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05305839, + "epoch": 0.2783762985763755, + "flos": 623782978560.0, + "grad_norm": 0.06536474240751361, + "language_loss": 0.83167183, + "learning_rate": 0.0008467709958949668, + "loss": 0.84254414, + "num_input_tokens_seen": 119631232, + "router_z_loss_mlp": 0.34204102, + "step": 1447, + "time_per_iteration": 2.737135887145996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087199, + "balance_loss_mlp": 1.05362952, + "epoch": 0.2785686802616391, + "flos": 581571021312.0, + "grad_norm": 0.057056565872365954, + "language_loss": 0.85917461, + "learning_rate": 0.0008465464884672403, + "loss": 0.87004662, + "num_input_tokens_seen": 119700224, + "router_z_loss_mlp": 0.33569336, + "step": 1448, + "time_per_iteration": 2.6771810054779053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087224, + "balance_loss_mlp": 1.05346394, + "epoch": 0.27876106194690264, + "flos": 587032980480.0, + "grad_norm": 0.06237565084734976, + "language_loss": 0.85356677, + "learning_rate": 0.0008463218464982348, + "loss": 0.86443901, + "num_input_tokens_seen": 119781376, + "router_z_loss_mlp": 0.33789062, + "step": 1449, + "time_per_iteration": 2.799407720565796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086336, + "balance_loss_mlp": 1.05228984, + "epoch": 0.27895344363216623, + "flos": 875621574144.0, + "grad_norm": 0.06450477685794259, + "language_loss": 0.87800258, + "learning_rate": 0.0008460970700751645, + "loss": 0.88886595, + "num_input_tokens_seen": 119856672, + "router_z_loss_mlp": 0.34057617, + "step": 1450, + "time_per_iteration": 3.0517759323120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086942, + "balance_loss_mlp": 1.05322921, + "epoch": 0.27914582531742976, + "flos": 603630005760.0, + "grad_norm": 0.06893143963997089, + "language_loss": 0.87761652, + "learning_rate": 0.000845872159285295, + "loss": 0.88848597, + "num_input_tokens_seen": 119929008, + "router_z_loss_mlp": 0.33740234, + "step": 1451, + "time_per_iteration": 2.6964316368103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01042481, + "balance_loss_mlp": 1.0276041, + "epoch": 0.27933820700269335, + "flos": 1496892953088.0, + "grad_norm": 0.0242162718076618, + "language_loss": 0.77766848, + "learning_rate": 0.0008456471142159447, + "loss": 0.78809333, + "num_input_tokens_seen": 120164032, + "router_z_loss_mlp": 0.1484375, + "step": 1452, + "time_per_iteration": 4.936378717422485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085289, + "balance_loss_mlp": 1.05162406, + "epoch": 0.2795305886879569, + "flos": 1031445854208.0, + "grad_norm": 0.05721806240601363, + "language_loss": 0.86067116, + "learning_rate": 0.0008454219349544836, + "loss": 0.87152404, + "num_input_tokens_seen": 120246784, + "router_z_loss_mlp": 0.33691406, + "step": 1453, + "time_per_iteration": 3.336336135864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086797, + "balance_loss_mlp": 1.053442, + "epoch": 0.27972297037322047, + "flos": 606766934016.0, + "grad_norm": 0.056433536115445035, + "language_loss": 0.81829166, + "learning_rate": 0.000845196621588334, + "loss": 0.82915968, + "num_input_tokens_seen": 120318208, + "router_z_loss_mlp": 0.33374023, + "step": 1454, + "time_per_iteration": 2.7415192127227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088727, + "balance_loss_mlp": 1.05475271, + "epoch": 0.27991535205848406, + "flos": 630085948416.0, + "grad_norm": 0.05700257056170363, + "language_loss": 0.76605666, + "learning_rate": 0.0008449711742049706, + "loss": 0.77694392, + "num_input_tokens_seen": 120393248, + "router_z_loss_mlp": 0.33984375, + "step": 1455, + "time_per_iteration": 2.755082130432129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093567, + "balance_loss_mlp": 1.06035542, + "epoch": 0.2801077337437476, + "flos": 549034689024.0, + "grad_norm": 0.056412270826162, + "language_loss": 0.83750427, + "learning_rate": 0.0008447455928919196, + "loss": 0.84843993, + "num_input_tokens_seen": 120461040, + "router_z_loss_mlp": 0.33227539, + "step": 1456, + "time_per_iteration": 2.601306438446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097544, + "balance_loss_mlp": 1.06423688, + "epoch": 0.2803001154290112, + "flos": 486516989952.0, + "grad_norm": 0.08664389404831466, + "language_loss": 0.86875856, + "learning_rate": 0.0008445198777367595, + "loss": 0.87973404, + "num_input_tokens_seen": 120530400, + "router_z_loss_mlp": 0.33325195, + "step": 1457, + "time_per_iteration": 2.5534915924072266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106361, + "balance_loss_mlp": 1.07267249, + "epoch": 0.2804924971142747, + "flos": 521820693504.0, + "grad_norm": 0.060155581105879694, + "language_loss": 0.8096568, + "learning_rate": 0.0008442940288271208, + "loss": 0.82072043, + "num_input_tokens_seen": 120598304, + "router_z_loss_mlp": 0.33691406, + "step": 1458, + "time_per_iteration": 2.6646370887756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108311, + "balance_loss_mlp": 1.07459903, + "epoch": 0.2806848787995383, + "flos": 527410690560.0, + "grad_norm": 0.05492641307724046, + "language_loss": 0.86995763, + "learning_rate": 0.0008440680462506856, + "loss": 0.88104069, + "num_input_tokens_seen": 120675712, + "router_z_loss_mlp": 0.33740234, + "step": 1459, + "time_per_iteration": 2.793306589126587 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103222, + "balance_loss_mlp": 1.06927133, + "epoch": 0.2808772604848018, + "flos": 485246785536.0, + "grad_norm": 0.053370474172872176, + "language_loss": 0.86799729, + "learning_rate": 0.0008438419300951883, + "loss": 0.87902945, + "num_input_tokens_seen": 120746544, + "router_z_loss_mlp": 0.33984375, + "step": 1460, + "time_per_iteration": 2.6732945442199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098403, + "balance_loss_mlp": 1.06488156, + "epoch": 0.2810696421700654, + "flos": 617840801280.0, + "grad_norm": 0.06081455520295947, + "language_loss": 0.86599934, + "learning_rate": 0.0008436156804484148, + "loss": 0.87698334, + "num_input_tokens_seen": 120823520, + "router_z_loss_mlp": 0.33544922, + "step": 1461, + "time_per_iteration": 2.768385410308838 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087899, + "balance_loss_mlp": 1.05397177, + "epoch": 0.28126202385532895, + "flos": 454521922560.0, + "grad_norm": 0.061036272851527865, + "language_loss": 0.88221931, + "learning_rate": 0.0008433892973982031, + "loss": 0.89309829, + "num_input_tokens_seen": 120889568, + "router_z_loss_mlp": 0.33959961, + "step": 1462, + "time_per_iteration": 2.502269983291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108859, + "balance_loss_mlp": 1.0546149, + "epoch": 0.28145440554059253, + "flos": 530447284224.0, + "grad_norm": 0.06533100110399645, + "language_loss": 0.85006338, + "learning_rate": 0.0008431627810324431, + "loss": 0.86094928, + "num_input_tokens_seen": 120958480, + "router_z_loss_mlp": 0.34008789, + "step": 1463, + "time_per_iteration": 2.6481637954711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097854, + "balance_loss_mlp": 1.06344974, + "epoch": 0.2816467872258561, + "flos": 451996070400.0, + "grad_norm": 0.053948569536927254, + "language_loss": 0.81259125, + "learning_rate": 0.000842936131439076, + "loss": 0.82356977, + "num_input_tokens_seen": 121028032, + "router_z_loss_mlp": 0.34423828, + "step": 1464, + "time_per_iteration": 2.598619222640991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100723, + "balance_loss_mlp": 1.06665313, + "epoch": 0.28183916891111965, + "flos": 472464755712.0, + "grad_norm": 0.06117554261618067, + "language_loss": 0.88043475, + "learning_rate": 0.0008427093487060951, + "loss": 0.89144206, + "num_input_tokens_seen": 121099280, + "router_z_loss_mlp": 0.34106445, + "step": 1465, + "time_per_iteration": 2.611067533493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092883, + "balance_loss_mlp": 1.05917072, + "epoch": 0.28203155059638324, + "flos": 556770806784.0, + "grad_norm": 0.05001896034653533, + "language_loss": 0.84742111, + "learning_rate": 0.000842482432921545, + "loss": 0.85834992, + "num_input_tokens_seen": 121180240, + "router_z_loss_mlp": 0.33740234, + "step": 1466, + "time_per_iteration": 2.8155059814453125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109283, + "balance_loss_mlp": 1.05990458, + "epoch": 0.28222393228164677, + "flos": 416756385792.0, + "grad_norm": 0.06017010781955974, + "language_loss": 0.87132335, + "learning_rate": 0.0008422553841735225, + "loss": 0.88225162, + "num_input_tokens_seen": 121242736, + "router_z_loss_mlp": 0.3293457, + "step": 1467, + "time_per_iteration": 2.459348201751709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091932, + "balance_loss_mlp": 1.05731392, + "epoch": 0.28241631396691036, + "flos": 604629577728.0, + "grad_norm": 0.060074700521020694, + "language_loss": 0.84810078, + "learning_rate": 0.0008420282025501757, + "loss": 0.85902011, + "num_input_tokens_seen": 121319248, + "router_z_loss_mlp": 0.34643555, + "step": 1468, + "time_per_iteration": 2.7499678134918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086806, + "balance_loss_mlp": 1.05275989, + "epoch": 0.2826086956521739, + "flos": 572698529280.0, + "grad_norm": 0.05717030328031113, + "language_loss": 0.854882, + "learning_rate": 0.0008418008881397043, + "loss": 0.86575013, + "num_input_tokens_seen": 121392064, + "router_z_loss_mlp": 0.34082031, + "step": 1469, + "time_per_iteration": 2.6512861251831055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080943, + "balance_loss_mlp": 1.04677796, + "epoch": 0.2828010773374375, + "flos": 842367886848.0, + "grad_norm": 0.05184982716140645, + "language_loss": 0.82590878, + "learning_rate": 0.0008415734410303595, + "loss": 0.8367182, + "num_input_tokens_seen": 121475984, + "router_z_loss_mlp": 0.34204102, + "step": 1470, + "time_per_iteration": 3.1787467002868652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085547, + "balance_loss_mlp": 1.05214453, + "epoch": 0.28299345902270107, + "flos": 542402860032.0, + "grad_norm": 0.04590644458835405, + "language_loss": 0.90709066, + "learning_rate": 0.0008413458613104444, + "loss": 0.9179461, + "num_input_tokens_seen": 121551024, + "router_z_loss_mlp": 0.33447266, + "step": 1471, + "time_per_iteration": 2.6650707721710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092285, + "balance_loss_mlp": 1.05780995, + "epoch": 0.2831858407079646, + "flos": 571320635904.0, + "grad_norm": 0.05367648266979066, + "language_loss": 0.82737631, + "learning_rate": 0.0008411181490683129, + "loss": 0.83829916, + "num_input_tokens_seen": 121624528, + "router_z_loss_mlp": 0.34472656, + "step": 1472, + "time_per_iteration": 2.7423322200775146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104233, + "balance_loss_mlp": 1.06909025, + "epoch": 0.2833782223932282, + "flos": 763491861504.0, + "grad_norm": 0.05498194123694656, + "language_loss": 0.82467097, + "learning_rate": 0.0008408903043923707, + "loss": 0.83571333, + "num_input_tokens_seen": 121706736, + "router_z_loss_mlp": 0.35180664, + "step": 1473, + "time_per_iteration": 2.991787910461426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114011, + "balance_loss_mlp": 1.07810485, + "epoch": 0.2835706040784917, + "flos": 538793068032.0, + "grad_norm": 0.05681509946110844, + "language_loss": 0.81401253, + "learning_rate": 0.0008406623273710754, + "loss": 0.82515264, + "num_input_tokens_seen": 121773008, + "router_z_loss_mlp": 0.35913086, + "step": 1474, + "time_per_iteration": 2.58866286277771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112011, + "balance_loss_mlp": 1.07732141, + "epoch": 0.2837629857637553, + "flos": 530329420800.0, + "grad_norm": 0.06008709036576614, + "language_loss": 0.82883334, + "learning_rate": 0.0008404342180929351, + "loss": 0.83995342, + "num_input_tokens_seen": 121840016, + "router_z_loss_mlp": 0.34716797, + "step": 1475, + "time_per_iteration": 2.636383295059204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112842, + "balance_loss_mlp": 1.07834268, + "epoch": 0.28395536744901884, + "flos": 539763526656.0, + "grad_norm": 0.06514959519071023, + "language_loss": 0.81725156, + "learning_rate": 0.00084020597664651, + "loss": 0.82837999, + "num_input_tokens_seen": 121915008, + "router_z_loss_mlp": 0.34521484, + "step": 1476, + "time_per_iteration": 2.7587718963623047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106069, + "balance_loss_mlp": 1.0697813, + "epoch": 0.2841477491342824, + "flos": 573344510976.0, + "grad_norm": 0.0608139165355994, + "language_loss": 0.84204602, + "learning_rate": 0.0008399776031204111, + "loss": 0.85310674, + "num_input_tokens_seen": 121987456, + "router_z_loss_mlp": 0.36303711, + "step": 1477, + "time_per_iteration": 2.7376203536987305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097711, + "balance_loss_mlp": 1.06263912, + "epoch": 0.28434013081954596, + "flos": 571802264064.0, + "grad_norm": 0.06275845169868324, + "language_loss": 0.8026123, + "learning_rate": 0.0008397490976033009, + "loss": 0.81358939, + "num_input_tokens_seen": 122058720, + "router_z_loss_mlp": 0.35083008, + "step": 1478, + "time_per_iteration": 2.6391618251800537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0103776, + "balance_loss_mlp": 1.02412283, + "epoch": 0.28453251250480954, + "flos": 1552554832896.0, + "grad_norm": 0.016614249421738093, + "language_loss": 0.77879643, + "learning_rate": 0.000839520460183893, + "loss": 0.78917408, + "num_input_tokens_seen": 122285792, + "router_z_loss_mlp": 0.13671875, + "step": 1479, + "time_per_iteration": 4.730362176895142 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079858, + "balance_loss_mlp": 1.04550207, + "epoch": 0.28472489419007313, + "flos": 748720862208.0, + "grad_norm": 0.04873312803653651, + "language_loss": 0.85529596, + "learning_rate": 0.0008392916909509525, + "loss": 0.86609453, + "num_input_tokens_seen": 122366608, + "router_z_loss_mlp": 0.34399414, + "step": 1480, + "time_per_iteration": 3.0429892539978027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.0495913, + "epoch": 0.28491727587533666, + "flos": 489914376192.0, + "grad_norm": 0.056617404906403615, + "language_loss": 0.85149431, + "learning_rate": 0.0008390627899932954, + "loss": 0.86233348, + "num_input_tokens_seen": 122435536, + "router_z_loss_mlp": 0.34375, + "step": 1481, + "time_per_iteration": 2.6355843544006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083301, + "balance_loss_mlp": 1.04818201, + "epoch": 0.28510965756060025, + "flos": 728671196160.0, + "grad_norm": 0.06013951169928809, + "language_loss": 0.88358414, + "learning_rate": 0.000838833757399789, + "loss": 0.89441717, + "num_input_tokens_seen": 122515584, + "router_z_loss_mlp": 0.3515625, + "step": 1482, + "time_per_iteration": 2.9198856353759766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088409, + "balance_loss_mlp": 1.05357623, + "epoch": 0.2853020392458638, + "flos": 551300083200.0, + "grad_norm": 0.06378715850004578, + "language_loss": 0.80512154, + "learning_rate": 0.0008386045932593515, + "loss": 0.81600565, + "num_input_tokens_seen": 122585552, + "router_z_loss_mlp": 0.34887695, + "step": 1483, + "time_per_iteration": 2.665919065475464 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087283, + "balance_loss_mlp": 1.05233121, + "epoch": 0.28549442093112737, + "flos": 754456425984.0, + "grad_norm": 0.06049898751226662, + "language_loss": 0.86304945, + "learning_rate": 0.0008383752976609525, + "loss": 0.87392229, + "num_input_tokens_seen": 122658928, + "router_z_loss_mlp": 0.34960938, + "step": 1484, + "time_per_iteration": 2.9113876819610596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081674, + "balance_loss_mlp": 1.04586315, + "epoch": 0.2856868026163909, + "flos": 538311439872.0, + "grad_norm": 0.05363431349597561, + "language_loss": 0.79897112, + "learning_rate": 0.0008381458706936123, + "loss": 0.80978787, + "num_input_tokens_seen": 122729056, + "router_z_loss_mlp": 0.3581543, + "step": 1485, + "time_per_iteration": 2.715182065963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082792, + "balance_loss_mlp": 1.0470289, + "epoch": 0.2858791843016545, + "flos": 583487207424.0, + "grad_norm": 0.055785658857036256, + "language_loss": 0.8776381, + "learning_rate": 0.0008379163124464025, + "loss": 0.888466, + "num_input_tokens_seen": 122802832, + "router_z_loss_mlp": 0.35766602, + "step": 1486, + "time_per_iteration": 2.713412284851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083174, + "balance_loss_mlp": 1.04881775, + "epoch": 0.286071565986918, + "flos": 644503357440.0, + "grad_norm": 0.05967072286491994, + "language_loss": 0.76593089, + "learning_rate": 0.0008376866230084452, + "loss": 0.7767626, + "num_input_tokens_seen": 122881328, + "router_z_loss_mlp": 0.34399414, + "step": 1487, + "time_per_iteration": 2.812953472137451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091632, + "balance_loss_mlp": 1.05522537, + "epoch": 0.2862639476721816, + "flos": 491120561664.0, + "grad_norm": 0.06413337589788286, + "language_loss": 0.85965335, + "learning_rate": 0.000837456802468914, + "loss": 0.87056965, + "num_input_tokens_seen": 122949680, + "router_z_loss_mlp": 0.36401367, + "step": 1488, + "time_per_iteration": 2.5974318981170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096408, + "balance_loss_mlp": 1.06050217, + "epoch": 0.2864563293574452, + "flos": 521363796480.0, + "grad_norm": 0.06049840128310572, + "language_loss": 0.85439187, + "learning_rate": 0.0008372268509170331, + "loss": 0.86535597, + "num_input_tokens_seen": 123024736, + "router_z_loss_mlp": 0.35888672, + "step": 1489, + "time_per_iteration": 2.6646056175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100143, + "balance_loss_mlp": 1.06478548, + "epoch": 0.2866487110427087, + "flos": 546834723840.0, + "grad_norm": 0.05582745965585505, + "language_loss": 0.84845203, + "learning_rate": 0.0008369967684420779, + "loss": 0.85945344, + "num_input_tokens_seen": 123097344, + "router_z_loss_mlp": 0.35424805, + "step": 1490, + "time_per_iteration": 2.737180471420288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094742, + "balance_loss_mlp": 1.05902719, + "epoch": 0.2868410927279723, + "flos": 481977437184.0, + "grad_norm": 0.0702351654670911, + "language_loss": 0.84684229, + "learning_rate": 0.0008367665551333736, + "loss": 0.85778964, + "num_input_tokens_seen": 123166240, + "router_z_loss_mlp": 0.35717773, + "step": 1491, + "time_per_iteration": 2.591179847717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095437, + "balance_loss_mlp": 1.05807662, + "epoch": 0.28703347441323585, + "flos": 724578365952.0, + "grad_norm": 0.0690733570245185, + "language_loss": 0.85732669, + "learning_rate": 0.0008365362110802977, + "loss": 0.86828107, + "num_input_tokens_seen": 123238160, + "router_z_loss_mlp": 0.37329102, + "step": 1492, + "time_per_iteration": 2.8586251735687256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083578, + "balance_loss_mlp": 1.04733849, + "epoch": 0.28722585609849943, + "flos": 634670581248.0, + "grad_norm": 0.059898504183233336, + "language_loss": 0.82604659, + "learning_rate": 0.0008363057363722773, + "loss": 0.83688229, + "num_input_tokens_seen": 123319504, + "router_z_loss_mlp": 0.36254883, + "step": 1493, + "time_per_iteration": 2.8491427898406982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076927, + "balance_loss_mlp": 1.04264212, + "epoch": 0.28741823778376296, + "flos": 509974216704.0, + "grad_norm": 0.05796804627405179, + "language_loss": 0.84095198, + "learning_rate": 0.0008360751310987906, + "loss": 0.85172129, + "num_input_tokens_seen": 123387008, + "router_z_loss_mlp": 0.34301758, + "step": 1494, + "time_per_iteration": 2.5735158920288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077369, + "balance_loss_mlp": 1.04294157, + "epoch": 0.28761061946902655, + "flos": 603458297856.0, + "grad_norm": 0.07368534281083228, + "language_loss": 0.85552645, + "learning_rate": 0.0008358443953493666, + "loss": 0.86630011, + "num_input_tokens_seen": 123471056, + "router_z_loss_mlp": 0.34472656, + "step": 1495, + "time_per_iteration": 2.8492400646209717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078466, + "balance_loss_mlp": 1.04260767, + "epoch": 0.28780300115429014, + "flos": 406977454080.0, + "grad_norm": 0.061458136593000166, + "language_loss": 0.88553399, + "learning_rate": 0.0008356135292135851, + "loss": 0.89631861, + "num_input_tokens_seen": 123535024, + "router_z_loss_mlp": 0.35864258, + "step": 1496, + "time_per_iteration": 2.499234676361084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109055, + "balance_loss_mlp": 1.05428672, + "epoch": 0.28799538283955367, + "flos": 374726310912.0, + "grad_norm": 0.06023187099093886, + "language_loss": 0.92244387, + "learning_rate": 0.0008353825327810758, + "loss": 0.93334937, + "num_input_tokens_seen": 123596224, + "router_z_loss_mlp": 0.36230469, + "step": 1497, + "time_per_iteration": 2.4068801403045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083025, + "balance_loss_mlp": 1.04761958, + "epoch": 0.28818776452481726, + "flos": 591645316608.0, + "grad_norm": 0.050935971597675156, + "language_loss": 0.81914794, + "learning_rate": 0.00083515140614152, + "loss": 0.82997811, + "num_input_tokens_seen": 123668640, + "router_z_loss_mlp": 0.35473633, + "step": 1498, + "time_per_iteration": 2.7172293663024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085079, + "balance_loss_mlp": 1.05012727, + "epoch": 0.2883801462100808, + "flos": 534819511296.0, + "grad_norm": 0.055380500747477406, + "language_loss": 0.86671853, + "learning_rate": 0.0008349201493846485, + "loss": 0.87756932, + "num_input_tokens_seen": 123740816, + "router_z_loss_mlp": 0.34985352, + "step": 1499, + "time_per_iteration": 2.666877508163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088163, + "balance_loss_mlp": 1.05268669, + "epoch": 0.2885725278953444, + "flos": 479850255360.0, + "grad_norm": 0.06392675802491345, + "language_loss": 0.89344347, + "learning_rate": 0.0008346887626002432, + "loss": 0.90432513, + "num_input_tokens_seen": 123805968, + "router_z_loss_mlp": 0.35473633, + "step": 1500, + "time_per_iteration": 2.547353744506836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081111, + "balance_loss_mlp": 1.04613519, + "epoch": 0.2887649095806079, + "flos": 463798877184.0, + "grad_norm": 0.050375470508879826, + "language_loss": 0.86195928, + "learning_rate": 0.000834457245878137, + "loss": 0.87277037, + "num_input_tokens_seen": 123876576, + "router_z_loss_mlp": 0.35009766, + "step": 1501, + "time_per_iteration": 2.6108102798461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076981, + "balance_loss_mlp": 1.04317355, + "epoch": 0.2889572912658715, + "flos": 930631527936.0, + "grad_norm": 0.05668037017333152, + "language_loss": 0.81365681, + "learning_rate": 0.000834225599308212, + "loss": 0.82442665, + "num_input_tokens_seen": 123967664, + "router_z_loss_mlp": 0.33837891, + "step": 1502, + "time_per_iteration": 3.2222447395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085232, + "balance_loss_mlp": 1.04994583, + "epoch": 0.28914967295113503, + "flos": 569848200192.0, + "grad_norm": 0.05132223508893719, + "language_loss": 0.85018057, + "learning_rate": 0.0008339938229804016, + "loss": 0.8610329, + "num_input_tokens_seen": 124039680, + "router_z_loss_mlp": 0.35327148, + "step": 1503, + "time_per_iteration": 2.698528289794922 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032992, + "balance_loss_mlp": 1.01945031, + "epoch": 0.2893420546363986, + "flos": 1485803119104.0, + "grad_norm": 0.02573157997511775, + "language_loss": 0.75434822, + "learning_rate": 0.0008337619169846895, + "loss": 0.76467812, + "num_input_tokens_seen": 124278848, + "router_z_loss_mlp": 0.13574219, + "step": 1504, + "time_per_iteration": 4.950274467468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083767, + "balance_loss_mlp": 1.04793239, + "epoch": 0.2895344363216622, + "flos": 469938903552.0, + "grad_norm": 0.06568085425348943, + "language_loss": 0.84119928, + "learning_rate": 0.0008335298814111094, + "loss": 0.85203701, + "num_input_tokens_seen": 124346736, + "router_z_loss_mlp": 0.35864258, + "step": 1505, + "time_per_iteration": 2.542043924331665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085418, + "balance_loss_mlp": 1.05089498, + "epoch": 0.28972681800692573, + "flos": 647909508096.0, + "grad_norm": 0.06591449016003405, + "language_loss": 0.87860626, + "learning_rate": 0.0008332977163497455, + "loss": 0.88946044, + "num_input_tokens_seen": 124420816, + "router_z_loss_mlp": 0.34570312, + "step": 1506, + "time_per_iteration": 2.810399293899536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087351, + "balance_loss_mlp": 1.05163622, + "epoch": 0.2899191996921893, + "flos": 571955033088.0, + "grad_norm": 0.054529888005095714, + "language_loss": 0.83185649, + "learning_rate": 0.0008330654218907325, + "loss": 0.84272999, + "num_input_tokens_seen": 124490480, + "router_z_loss_mlp": 0.35742188, + "step": 1507, + "time_per_iteration": 2.6968414783477783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084817, + "balance_loss_mlp": 1.04981744, + "epoch": 0.29011158137745285, + "flos": 661037773824.0, + "grad_norm": 0.1280653735040032, + "language_loss": 0.81777966, + "learning_rate": 0.0008328329981242548, + "loss": 0.82862782, + "num_input_tokens_seen": 124564960, + "router_z_loss_mlp": 0.3503418, + "step": 1508, + "time_per_iteration": 2.886396884918213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089679, + "balance_loss_mlp": 1.05441689, + "epoch": 0.29030396306271644, + "flos": 535933974528.0, + "grad_norm": 0.060234790533374126, + "language_loss": 0.87937206, + "learning_rate": 0.0008326004451405475, + "loss": 0.89026886, + "num_input_tokens_seen": 124637424, + "router_z_loss_mlp": 0.3527832, + "step": 1509, + "time_per_iteration": 2.7797772884368896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096033, + "balance_loss_mlp": 1.06148589, + "epoch": 0.29049634474798, + "flos": 511707110400.0, + "grad_norm": 0.05385470227261208, + "language_loss": 0.82548428, + "learning_rate": 0.0008323677630298957, + "loss": 0.83644462, + "num_input_tokens_seen": 124704832, + "router_z_loss_mlp": 0.34594727, + "step": 1510, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093546, + "balance_loss_mlp": 1.05892766, + "epoch": 0.29068872643324356, + "flos": 613454017536.0, + "grad_norm": 0.05556182475666109, + "language_loss": 0.85001689, + "learning_rate": 0.0008321349518826345, + "loss": 0.86095232, + "num_input_tokens_seen": 124779600, + "router_z_loss_mlp": 0.34643555, + "step": 1511, + "time_per_iteration": 2.7849388122558594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093876, + "balance_loss_mlp": 1.05878115, + "epoch": 0.2908811081185071, + "flos": 546164011008.0, + "grad_norm": 0.07046084545113683, + "language_loss": 0.94823933, + "learning_rate": 0.0008319020117891491, + "loss": 0.95917809, + "num_input_tokens_seen": 124844128, + "router_z_loss_mlp": 0.35131836, + "step": 1512, + "time_per_iteration": 2.5936031341552734 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090975, + "balance_loss_mlp": 1.05485463, + "epoch": 0.2910734898037707, + "flos": 604516096512.0, + "grad_norm": 0.06307487928884016, + "language_loss": 0.87063539, + "learning_rate": 0.0008316689428398751, + "loss": 0.88154513, + "num_input_tokens_seen": 124915376, + "router_z_loss_mlp": 0.36108398, + "step": 1513, + "time_per_iteration": 2.6774067878723145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082761, + "balance_loss_mlp": 1.04792798, + "epoch": 0.29126587148903427, + "flos": 574383370752.0, + "grad_norm": 0.043578668947666564, + "language_loss": 0.88254529, + "learning_rate": 0.0008314357451252979, + "loss": 0.89337289, + "num_input_tokens_seen": 124995504, + "router_z_loss_mlp": 0.34887695, + "step": 1514, + "time_per_iteration": 2.7561941146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086548, + "balance_loss_mlp": 1.05092812, + "epoch": 0.2914582531742978, + "flos": 570802692096.0, + "grad_norm": 0.06240160889449628, + "language_loss": 0.87564558, + "learning_rate": 0.0008312024187359527, + "loss": 0.88651109, + "num_input_tokens_seen": 125064192, + "router_z_loss_mlp": 0.35644531, + "step": 1515, + "time_per_iteration": 2.636056900024414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084088, + "balance_loss_mlp": 1.04942179, + "epoch": 0.2916506348595614, + "flos": 730523363328.0, + "grad_norm": 0.06185972429295104, + "language_loss": 0.87361014, + "learning_rate": 0.000830968963762425, + "loss": 0.88445103, + "num_input_tokens_seen": 125150560, + "router_z_loss_mlp": 0.34716797, + "step": 1516, + "time_per_iteration": 3.021732807159424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091474, + "balance_loss_mlp": 1.05528235, + "epoch": 0.2918430165448249, + "flos": 510220118016.0, + "grad_norm": 0.05583453201751925, + "language_loss": 0.83947027, + "learning_rate": 0.0008307353802953497, + "loss": 0.85038507, + "num_input_tokens_seen": 125219264, + "router_z_loss_mlp": 0.36206055, + "step": 1517, + "time_per_iteration": 2.6659955978393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100836, + "balance_loss_mlp": 1.06318951, + "epoch": 0.2920353982300885, + "flos": 630096122880.0, + "grad_norm": 0.04472517729516854, + "language_loss": 0.86110896, + "learning_rate": 0.0008305016684254125, + "loss": 0.87211728, + "num_input_tokens_seen": 125301904, + "router_z_loss_mlp": 0.37646484, + "step": 1518, + "time_per_iteration": 2.7896409034729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098986, + "balance_loss_mlp": 1.06153059, + "epoch": 0.29222777991535204, + "flos": 501411644928.0, + "grad_norm": 0.055409034097420505, + "language_loss": 0.86932153, + "learning_rate": 0.0008302678282433479, + "loss": 0.88031137, + "num_input_tokens_seen": 125367712, + "router_z_loss_mlp": 0.37451172, + "step": 1519, + "time_per_iteration": 2.585256814956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095676, + "balance_loss_mlp": 1.05891216, + "epoch": 0.2924201616006156, + "flos": 486522782208.0, + "grad_norm": 0.057505891705300044, + "language_loss": 0.85011005, + "learning_rate": 0.0008300338598399411, + "loss": 0.86106682, + "num_input_tokens_seen": 125437648, + "router_z_loss_mlp": 0.36791992, + "step": 1520, + "time_per_iteration": 2.6471352577209473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109825, + "balance_loss_mlp": 1.07210708, + "epoch": 0.2926125432858792, + "flos": 476211350016.0, + "grad_norm": 0.05302442020038178, + "language_loss": 0.9456166, + "learning_rate": 0.0008297997633060263, + "loss": 0.95671487, + "num_input_tokens_seen": 125502432, + "router_z_loss_mlp": 0.37719727, + "step": 1521, + "time_per_iteration": 2.547457695007324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106499, + "balance_loss_mlp": 1.06987774, + "epoch": 0.29280492497114274, + "flos": 676379151360.0, + "grad_norm": 0.054888704647412474, + "language_loss": 0.85310549, + "learning_rate": 0.0008295655387324883, + "loss": 0.86417043, + "num_input_tokens_seen": 125575424, + "router_z_loss_mlp": 0.36645508, + "step": 1522, + "time_per_iteration": 2.822557210922241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105716, + "balance_loss_mlp": 1.0686183, + "epoch": 0.29299730665640633, + "flos": 458175384576.0, + "grad_norm": 0.055715580232585875, + "language_loss": 0.85025144, + "learning_rate": 0.0008293311862102609, + "loss": 0.86130863, + "num_input_tokens_seen": 125639040, + "router_z_loss_mlp": 0.37084961, + "step": 1523, + "time_per_iteration": 2.5033791065216064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103076, + "balance_loss_mlp": 1.06795669, + "epoch": 0.29318968834166986, + "flos": 446343464448.0, + "grad_norm": 0.0584953499596263, + "language_loss": 0.88722956, + "learning_rate": 0.0008290967058303275, + "loss": 0.89826035, + "num_input_tokens_seen": 125701712, + "router_z_loss_mlp": 0.3515625, + "step": 1524, + "time_per_iteration": 2.5981156826019287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102477, + "balance_loss_mlp": 1.06630898, + "epoch": 0.29338207002693345, + "flos": 450085676544.0, + "grad_norm": 0.05072610752657829, + "language_loss": 0.86932707, + "learning_rate": 0.0008288620976837219, + "loss": 0.88035178, + "num_input_tokens_seen": 125765088, + "router_z_loss_mlp": 0.36181641, + "step": 1525, + "time_per_iteration": 2.522019863128662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092716, + "balance_loss_mlp": 1.05623853, + "epoch": 0.293574451712197, + "flos": 502027103232.0, + "grad_norm": 0.05230718040210392, + "language_loss": 0.83001733, + "learning_rate": 0.000828627361861527, + "loss": 0.84094453, + "num_input_tokens_seen": 125831328, + "router_z_loss_mlp": 0.36474609, + "step": 1526, + "time_per_iteration": 2.559201955795288 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085779, + "balance_loss_mlp": 1.05051661, + "epoch": 0.29376683339746057, + "flos": 696158184960.0, + "grad_norm": 0.071892180297548, + "language_loss": 0.8465147, + "learning_rate": 0.0008283924984548752, + "loss": 0.85737246, + "num_input_tokens_seen": 125903664, + "router_z_loss_mlp": 0.3527832, + "step": 1527, + "time_per_iteration": 2.8108816146850586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079347, + "balance_loss_mlp": 1.04494321, + "epoch": 0.2939592150827241, + "flos": 478353088512.0, + "grad_norm": 0.05128355551395112, + "language_loss": 0.85087478, + "learning_rate": 0.0008281575075549485, + "loss": 0.86166823, + "num_input_tokens_seen": 125971856, + "router_z_loss_mlp": 0.34399414, + "step": 1528, + "time_per_iteration": 2.576814889907837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01051452, + "balance_loss_mlp": 1.04057992, + "epoch": 0.2941515967679877, + "flos": 1484482042368.0, + "grad_norm": 0.031732851839211505, + "language_loss": 0.77352691, + "learning_rate": 0.000827922389252979, + "loss": 0.7840414, + "num_input_tokens_seen": 126183968, + "router_z_loss_mlp": 0.10888672, + "step": 1529, + "time_per_iteration": 4.662023067474365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089321, + "balance_loss_mlp": 1.0547502, + "epoch": 0.2943439784532513, + "flos": 673848916992.0, + "grad_norm": 0.06453398347295829, + "language_loss": 0.90086716, + "learning_rate": 0.0008276871436402469, + "loss": 0.91176039, + "num_input_tokens_seen": 126254448, + "router_z_loss_mlp": 0.34619141, + "step": 1530, + "time_per_iteration": 2.795783758163452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097742, + "balance_loss_mlp": 1.06460166, + "epoch": 0.2945363601385148, + "flos": 576031896576.0, + "grad_norm": 0.05195467848041957, + "language_loss": 0.87790835, + "learning_rate": 0.000827451770808083, + "loss": 0.88888574, + "num_input_tokens_seen": 126328208, + "router_z_loss_mlp": 0.33154297, + "step": 1531, + "time_per_iteration": 2.6522414684295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100953, + "balance_loss_mlp": 1.06628692, + "epoch": 0.2947287418237784, + "flos": 480416251392.0, + "grad_norm": 0.05572078055736918, + "language_loss": 0.83276248, + "learning_rate": 0.0008272162708478674, + "loss": 0.84377199, + "num_input_tokens_seen": 126396464, + "router_z_loss_mlp": 0.34692383, + "step": 1532, + "time_per_iteration": 2.5960874557495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098518, + "balance_loss_mlp": 1.06459141, + "epoch": 0.2949211235090419, + "flos": 557917355520.0, + "grad_norm": 0.05232404820193651, + "language_loss": 0.86136103, + "learning_rate": 0.000826980643851029, + "loss": 0.87234622, + "num_input_tokens_seen": 126468960, + "router_z_loss_mlp": 0.33959961, + "step": 1533, + "time_per_iteration": 2.671867609024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106353, + "balance_loss_mlp": 1.07147205, + "epoch": 0.2951135051943055, + "flos": 483646311936.0, + "grad_norm": 0.06650262295584625, + "language_loss": 0.84864676, + "learning_rate": 0.0008267448899090464, + "loss": 0.85971034, + "num_input_tokens_seen": 126536496, + "router_z_loss_mlp": 0.34887695, + "step": 1534, + "time_per_iteration": 2.5133543014526367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095921, + "balance_loss_mlp": 1.0604682, + "epoch": 0.29530588687956905, + "flos": 550015322112.0, + "grad_norm": 0.05711998360561463, + "language_loss": 0.80980158, + "learning_rate": 0.0008265090091134473, + "loss": 0.82076073, + "num_input_tokens_seen": 126614048, + "router_z_loss_mlp": 0.35473633, + "step": 1535, + "time_per_iteration": 2.8528778553009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085228, + "balance_loss_mlp": 1.05072904, + "epoch": 0.29549826856483263, + "flos": 672731481600.0, + "grad_norm": 0.047870597747086484, + "language_loss": 0.80150926, + "learning_rate": 0.0008262730015558088, + "loss": 0.8123616, + "num_input_tokens_seen": 126697248, + "router_z_loss_mlp": 0.34521484, + "step": 1536, + "time_per_iteration": 2.8849382400512695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076376, + "balance_loss_mlp": 1.04192495, + "epoch": 0.29569065025009617, + "flos": 764300786688.0, + "grad_norm": 0.06331525049863725, + "language_loss": 0.82269859, + "learning_rate": 0.0008260368673277574, + "loss": 0.8334623, + "num_input_tokens_seen": 126782496, + "router_z_loss_mlp": 0.34472656, + "step": 1537, + "time_per_iteration": 3.12172269821167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078873, + "balance_loss_mlp": 1.04480314, + "epoch": 0.29588303193535975, + "flos": 543398049792.0, + "grad_norm": 0.05107262607685598, + "language_loss": 0.84019077, + "learning_rate": 0.0008258006065209682, + "loss": 0.85097957, + "num_input_tokens_seen": 126857328, + "router_z_loss_mlp": 0.34106445, + "step": 1538, + "time_per_iteration": 2.7388381958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082306, + "balance_loss_mlp": 1.04744971, + "epoch": 0.29607541362062334, + "flos": 596648968704.0, + "grad_norm": 0.06469434822608365, + "language_loss": 0.80634302, + "learning_rate": 0.0008255642192271657, + "loss": 0.81716609, + "num_input_tokens_seen": 126932608, + "router_z_loss_mlp": 0.34863281, + "step": 1539, + "time_per_iteration": 2.7957324981689453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082434, + "balance_loss_mlp": 1.04774427, + "epoch": 0.29626779530588687, + "flos": 609588149760.0, + "grad_norm": 0.06097977692176942, + "language_loss": 0.83830953, + "learning_rate": 0.0008253277055381241, + "loss": 0.84913385, + "num_input_tokens_seen": 127008928, + "router_z_loss_mlp": 0.34741211, + "step": 1540, + "time_per_iteration": 2.8428521156311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085866, + "balance_loss_mlp": 1.05146217, + "epoch": 0.29646017699115046, + "flos": 867050237952.0, + "grad_norm": 0.06407432486539091, + "language_loss": 0.8580029, + "learning_rate": 0.0008250910655456658, + "loss": 0.8688615, + "num_input_tokens_seen": 127097104, + "router_z_loss_mlp": 0.34448242, + "step": 1541, + "time_per_iteration": 3.1741185188293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081587, + "balance_loss_mlp": 1.04696846, + "epoch": 0.296652558676414, + "flos": 495616444416.0, + "grad_norm": 0.06683547404256097, + "language_loss": 0.83703458, + "learning_rate": 0.0008248542993416625, + "loss": 0.84785044, + "num_input_tokens_seen": 127165264, + "router_z_loss_mlp": 0.34643555, + "step": 1542, + "time_per_iteration": 2.5634429454803467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083482, + "balance_loss_mlp": 1.04914963, + "epoch": 0.2968449403616776, + "flos": 571275555840.0, + "grad_norm": 0.054805025189504364, + "language_loss": 0.83645159, + "learning_rate": 0.0008246174070180352, + "loss": 0.84728634, + "num_input_tokens_seen": 127238992, + "router_z_loss_mlp": 0.34375, + "step": 1543, + "time_per_iteration": 2.664029121398926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108698, + "balance_loss_mlp": 1.05226684, + "epoch": 0.2970373220469411, + "flos": 793799115264.0, + "grad_norm": 0.06369286414713611, + "language_loss": 0.84087443, + "learning_rate": 0.0008243803886667537, + "loss": 0.85174423, + "num_input_tokens_seen": 127328160, + "router_z_loss_mlp": 0.34765625, + "step": 1544, + "time_per_iteration": 3.129185199737549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082817, + "balance_loss_mlp": 1.04793644, + "epoch": 0.2972297037322047, + "flos": 660736617984.0, + "grad_norm": 0.0569938777400986, + "language_loss": 0.79051471, + "learning_rate": 0.0008241432443798364, + "loss": 0.80134284, + "num_input_tokens_seen": 127407328, + "router_z_loss_mlp": 0.34936523, + "step": 1545, + "time_per_iteration": 2.7968478202819824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076425, + "balance_loss_mlp": 1.04225969, + "epoch": 0.29742208541746823, + "flos": 596849789952.0, + "grad_norm": 0.05185676674228935, + "language_loss": 0.85634965, + "learning_rate": 0.0008239059742493512, + "loss": 0.86711389, + "num_input_tokens_seen": 127477136, + "router_z_loss_mlp": 0.34204102, + "step": 1546, + "time_per_iteration": 2.730803966522217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079044, + "balance_loss_mlp": 1.0448308, + "epoch": 0.2976144671027318, + "flos": 769519816704.0, + "grad_norm": 0.049935350225070424, + "language_loss": 0.87424839, + "learning_rate": 0.0008236685783674142, + "loss": 0.88503873, + "num_input_tokens_seen": 127565680, + "router_z_loss_mlp": 0.34228516, + "step": 1547, + "time_per_iteration": 3.0735998153686523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060573, + "balance_loss_mlp": 1.04903388, + "epoch": 0.2978068487879954, + "flos": 1483980065280.0, + "grad_norm": 0.022808758650826922, + "language_loss": 0.76221192, + "learning_rate": 0.0008234310568261911, + "loss": 0.77281767, + "num_input_tokens_seen": 127791584, + "router_z_loss_mlp": 0.11523438, + "step": 1548, + "time_per_iteration": 4.902673959732056 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088892, + "balance_loss_mlp": 1.05460715, + "epoch": 0.29799923047325894, + "flos": 475079357952.0, + "grad_norm": 0.07696298762455249, + "language_loss": 0.82568306, + "learning_rate": 0.0008231934097178955, + "loss": 0.83657193, + "num_input_tokens_seen": 127860112, + "router_z_loss_mlp": 0.34326172, + "step": 1549, + "time_per_iteration": 2.59600567817688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087503, + "balance_loss_mlp": 1.05173981, + "epoch": 0.2981916121585225, + "flos": 759464460288.0, + "grad_norm": 0.05308820200633048, + "language_loss": 0.8525809, + "learning_rate": 0.0008229556371347903, + "loss": 0.86345589, + "num_input_tokens_seen": 127938752, + "router_z_loss_mlp": 0.35791016, + "step": 1550, + "time_per_iteration": 2.955467939376831 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080247, + "balance_loss_mlp": 1.04593909, + "epoch": 0.29838399384378606, + "flos": 874642351104.0, + "grad_norm": 0.058723621398699785, + "language_loss": 0.79088616, + "learning_rate": 0.0008227177391691874, + "loss": 0.80168855, + "num_input_tokens_seen": 128022192, + "router_z_loss_mlp": 0.34350586, + "step": 1551, + "time_per_iteration": 3.1204521656036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084967, + "balance_loss_mlp": 1.05001473, + "epoch": 0.29857637552904964, + "flos": 579389995008.0, + "grad_norm": 0.060980844602782615, + "language_loss": 0.89576113, + "learning_rate": 0.0008224797159134463, + "loss": 0.90661073, + "num_input_tokens_seen": 128097776, + "router_z_loss_mlp": 0.34985352, + "step": 1552, + "time_per_iteration": 2.7535347938537598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075057, + "balance_loss_mlp": 1.04132128, + "epoch": 0.2987687572143132, + "flos": 836048950272.0, + "grad_norm": 0.05791718796165568, + "language_loss": 0.83571118, + "learning_rate": 0.0008222415674599765, + "loss": 0.84646177, + "num_input_tokens_seen": 128179888, + "router_z_loss_mlp": 0.33764648, + "step": 1553, + "time_per_iteration": 3.0609707832336426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077849, + "balance_loss_mlp": 1.0417521, + "epoch": 0.29896113889957676, + "flos": 566800022016.0, + "grad_norm": 0.05477323920870417, + "language_loss": 0.83255476, + "learning_rate": 0.0008220032939012349, + "loss": 0.84333324, + "num_input_tokens_seen": 128251152, + "router_z_loss_mlp": 0.36108398, + "step": 1554, + "time_per_iteration": 2.6683521270751953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077555, + "balance_loss_mlp": 1.04310393, + "epoch": 0.29915352058484035, + "flos": 498370669056.0, + "grad_norm": 0.049159177960894807, + "language_loss": 0.87956095, + "learning_rate": 0.0008217648953297277, + "loss": 0.89033645, + "num_input_tokens_seen": 128327600, + "router_z_loss_mlp": 0.34472656, + "step": 1555, + "time_per_iteration": 2.82114315032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109245, + "balance_loss_mlp": 1.05711639, + "epoch": 0.2993459022701039, + "flos": 591837373440.0, + "grad_norm": 0.06210935096260163, + "language_loss": 0.7799179, + "learning_rate": 0.0008215263718380095, + "loss": 0.79084241, + "num_input_tokens_seen": 128398432, + "router_z_loss_mlp": 0.35327148, + "step": 1556, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104132, + "balance_loss_mlp": 1.06815481, + "epoch": 0.29953828395536747, + "flos": 572107802112.0, + "grad_norm": 0.051501670996139066, + "language_loss": 0.8437115, + "learning_rate": 0.0008212877235186833, + "loss": 0.8547529, + "num_input_tokens_seen": 128469696, + "router_z_loss_mlp": 0.36010742, + "step": 1557, + "time_per_iteration": 2.706531286239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075275, + "balance_loss_mlp": 1.06321061, + "epoch": 0.299730665640631, + "flos": 1503855051264.0, + "grad_norm": 0.03618665962020262, + "language_loss": 0.77737558, + "learning_rate": 0.0008210489504644005, + "loss": 0.78812838, + "num_input_tokens_seen": 128698560, + "router_z_loss_mlp": 0.12060547, + "step": 1558, + "time_per_iteration": 4.914030313491821 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102225, + "balance_loss_mlp": 1.06641483, + "epoch": 0.2999230473258946, + "flos": 513538928640.0, + "grad_norm": 0.06717328469529676, + "language_loss": 0.80777293, + "learning_rate": 0.0008208100527678611, + "loss": 0.8187952, + "num_input_tokens_seen": 128765952, + "router_z_loss_mlp": 0.3581543, + "step": 1559, + "time_per_iteration": 2.5862650871276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097511, + "balance_loss_mlp": 1.06294012, + "epoch": 0.3001154290111581, + "flos": 834128381952.0, + "grad_norm": 0.05731533213860712, + "language_loss": 0.78337204, + "learning_rate": 0.0008205710305218135, + "loss": 0.79434717, + "num_input_tokens_seen": 128840048, + "router_z_loss_mlp": 0.34594727, + "step": 1560, + "time_per_iteration": 3.00581693649292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094947, + "balance_loss_mlp": 1.06149733, + "epoch": 0.3003078106964217, + "flos": 556485617664.0, + "grad_norm": 0.051151635719759364, + "language_loss": 0.89917201, + "learning_rate": 0.0008203318838190541, + "loss": 0.9101215, + "num_input_tokens_seen": 128912496, + "router_z_loss_mlp": 0.3347168, + "step": 1561, + "time_per_iteration": 2.730187177658081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05345702, + "epoch": 0.30050019238168524, + "flos": 525897556992.0, + "grad_norm": 0.07455466191279551, + "language_loss": 0.85053575, + "learning_rate": 0.0008200926127524281, + "loss": 0.86141509, + "num_input_tokens_seen": 128980624, + "router_z_loss_mlp": 0.34521484, + "step": 1562, + "time_per_iteration": 2.6252634525299072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04837239, + "epoch": 0.3006925740669488, + "flos": 577582907904.0, + "grad_norm": 0.08578868432126639, + "language_loss": 0.83193934, + "learning_rate": 0.0008198532174148289, + "loss": 0.84276754, + "num_input_tokens_seen": 129050576, + "router_z_loss_mlp": 0.3449707, + "step": 1563, + "time_per_iteration": 2.784132957458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01010668, + "balance_loss_mlp": 0.99912882, + "epoch": 0.3008849557522124, + "flos": 1489408528896.0, + "grad_norm": 0.006418694176289122, + "language_loss": 0.8068617, + "learning_rate": 0.0008196136978991977, + "loss": 0.81696838, + "num_input_tokens_seen": 129278880, + "router_z_loss_mlp": 0.11523438, + "step": 1564, + "time_per_iteration": 4.826026678085327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079569, + "balance_loss_mlp": 1.04607105, + "epoch": 0.30107733743747594, + "flos": 509565371904.0, + "grad_norm": 0.057361266050022765, + "language_loss": 0.88701093, + "learning_rate": 0.0008193740542985244, + "loss": 0.89780664, + "num_input_tokens_seen": 129346560, + "router_z_loss_mlp": 0.33520508, + "step": 1565, + "time_per_iteration": 2.5685722827911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082334, + "balance_loss_mlp": 1.04881263, + "epoch": 0.30126971912273953, + "flos": 587425858560.0, + "grad_norm": 0.055549771382925904, + "language_loss": 0.86598676, + "learning_rate": 0.0008191342867058467, + "loss": 0.87681007, + "num_input_tokens_seen": 129420448, + "router_z_loss_mlp": 0.33520508, + "step": 1566, + "time_per_iteration": 2.7413527965545654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088116, + "balance_loss_mlp": 1.05423677, + "epoch": 0.30146210080800306, + "flos": 601822918656.0, + "grad_norm": 0.054174391750340056, + "language_loss": 0.83411789, + "learning_rate": 0.0008188943952142509, + "loss": 0.84499902, + "num_input_tokens_seen": 129494032, + "router_z_loss_mlp": 0.33911133, + "step": 1567, + "time_per_iteration": 2.816777229309082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098385, + "balance_loss_mlp": 1.06376624, + "epoch": 0.30165448249326665, + "flos": 917424686592.0, + "grad_norm": 0.057308899380469513, + "language_loss": 0.81973398, + "learning_rate": 0.0008186543799168711, + "loss": 0.8307178, + "num_input_tokens_seen": 129569088, + "router_z_loss_mlp": 0.34643555, + "step": 1568, + "time_per_iteration": 3.138439655303955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094575, + "balance_loss_mlp": 1.060076, + "epoch": 0.3018468641785302, + "flos": 776953368576.0, + "grad_norm": 0.06314470525088989, + "language_loss": 0.88671768, + "learning_rate": 0.0008184142409068892, + "loss": 0.89766341, + "num_input_tokens_seen": 129647968, + "router_z_loss_mlp": 0.34545898, + "step": 1569, + "time_per_iteration": 3.0061678886413574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104914, + "balance_loss_mlp": 1.07134473, + "epoch": 0.30203924586379377, + "flos": 522101500416.0, + "grad_norm": 0.05000282823150535, + "language_loss": 0.86630476, + "learning_rate": 0.000818173978277536, + "loss": 0.87735385, + "num_input_tokens_seen": 129718928, + "router_z_loss_mlp": 0.3359375, + "step": 1570, + "time_per_iteration": 2.7171432971954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101367, + "balance_loss_mlp": 1.06779718, + "epoch": 0.3022316275490573, + "flos": 524288318976.0, + "grad_norm": 0.052630401262377564, + "language_loss": 0.83781934, + "learning_rate": 0.000817933592122089, + "loss": 0.84883296, + "num_input_tokens_seen": 129790128, + "router_z_loss_mlp": 0.3359375, + "step": 1571, + "time_per_iteration": 2.7346580028533936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105149, + "balance_loss_mlp": 1.0710789, + "epoch": 0.3024240092343209, + "flos": 479672755200.0, + "grad_norm": 0.05357670269103591, + "language_loss": 0.83451247, + "learning_rate": 0.0008176930825338749, + "loss": 0.84556395, + "num_input_tokens_seen": 129857536, + "router_z_loss_mlp": 0.34106445, + "step": 1572, + "time_per_iteration": 2.5449459552764893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.06110692, + "epoch": 0.3026163909195845, + "flos": 686901579264.0, + "grad_norm": 0.06283664606524127, + "language_loss": 0.8873198, + "learning_rate": 0.0008174524496062679, + "loss": 0.89827561, + "num_input_tokens_seen": 129931440, + "router_z_loss_mlp": 0.3449707, + "step": 1573, + "time_per_iteration": 2.8826043605804443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108869, + "balance_loss_mlp": 1.05380964, + "epoch": 0.302808772604848, + "flos": 542654553600.0, + "grad_norm": 0.05929060654319276, + "language_loss": 0.85444844, + "learning_rate": 0.0008172116934326894, + "loss": 0.86533535, + "num_input_tokens_seen": 130005200, + "router_z_loss_mlp": 0.34912109, + "step": 1574, + "time_per_iteration": 2.7539572715759277 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088334, + "balance_loss_mlp": 1.05435979, + "epoch": 0.3030011542901116, + "flos": 474852395520.0, + "grad_norm": 0.051325587648683973, + "language_loss": 0.87683225, + "learning_rate": 0.0008169708141066097, + "loss": 0.88771558, + "num_input_tokens_seen": 130069136, + "router_z_loss_mlp": 0.34008789, + "step": 1575, + "time_per_iteration": 2.5635225772857666 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086472, + "balance_loss_mlp": 1.05199683, + "epoch": 0.30319353597537513, + "flos": 481233940992.0, + "grad_norm": 0.06106098638193731, + "language_loss": 0.90820259, + "learning_rate": 0.0008167298117215465, + "loss": 0.91906732, + "num_input_tokens_seen": 130135456, + "router_z_loss_mlp": 0.3449707, + "step": 1576, + "time_per_iteration": 2.5388035774230957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087257, + "balance_loss_mlp": 1.05242443, + "epoch": 0.3033859176606387, + "flos": 704455916544.0, + "grad_norm": 0.06728579874610481, + "language_loss": 0.88300574, + "learning_rate": 0.0008164886863710649, + "loss": 0.89387834, + "num_input_tokens_seen": 130213712, + "router_z_loss_mlp": 0.34887695, + "step": 1577, + "time_per_iteration": 2.8935675621032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088541, + "balance_loss_mlp": 1.05554342, + "epoch": 0.30357829934590225, + "flos": 764344456704.0, + "grad_norm": 0.04642698643554312, + "language_loss": 0.86113924, + "learning_rate": 0.0008162474381487783, + "loss": 0.87202466, + "num_input_tokens_seen": 130290928, + "router_z_loss_mlp": 0.33007812, + "step": 1578, + "time_per_iteration": 3.0151257514953613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088208, + "balance_loss_mlp": 1.05401909, + "epoch": 0.30377068103116583, + "flos": 532082663424.0, + "grad_norm": 0.05691489249418783, + "language_loss": 0.84894794, + "learning_rate": 0.0008160060671483475, + "loss": 0.85983002, + "num_input_tokens_seen": 130362672, + "router_z_loss_mlp": 0.34228516, + "step": 1579, + "time_per_iteration": 2.6575984954833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097191, + "balance_loss_mlp": 1.06338358, + "epoch": 0.3039630627164294, + "flos": 509934928896.0, + "grad_norm": 0.07240450604386858, + "language_loss": 0.83520651, + "learning_rate": 0.0008157645734634809, + "loss": 0.84617841, + "num_input_tokens_seen": 130428848, + "router_z_loss_mlp": 0.33837891, + "step": 1580, + "time_per_iteration": 2.5869438648223877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01061274, + "balance_loss_mlp": 1.04992568, + "epoch": 0.30415544440169295, + "flos": 1505206803456.0, + "grad_norm": 0.030998653754179664, + "language_loss": 0.76896489, + "learning_rate": 0.000815522957187935, + "loss": 0.77957761, + "num_input_tokens_seen": 130665440, + "router_z_loss_mlp": 0.11328125, + "step": 1581, + "time_per_iteration": 4.907174348831177 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_mlp": 1.01606703, + "epoch": 0.30434782608695654, + "flos": 1457976637440.0, + "grad_norm": 0.012214555664928241, + "language_loss": 0.73214495, + "learning_rate": 0.0008152812184155132, + "loss": 0.74242246, + "num_input_tokens_seen": 130895248, + "router_z_loss_mlp": 0.11669922, + "step": 1582, + "time_per_iteration": 4.8717710971832275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102415, + "balance_loss_mlp": 1.0685122, + "epoch": 0.3045402077722201, + "flos": 482312088576.0, + "grad_norm": 0.05813255519406619, + "language_loss": 0.83851862, + "learning_rate": 0.000815039357240067, + "loss": 0.84954274, + "num_input_tokens_seen": 130964544, + "router_z_loss_mlp": 0.33935547, + "step": 1583, + "time_per_iteration": 2.640148401260376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102551, + "balance_loss_mlp": 1.06879056, + "epoch": 0.30473258945748366, + "flos": 543220549632.0, + "grad_norm": 0.06312099992380371, + "language_loss": 0.85312426, + "learning_rate": 0.0008147973737554952, + "loss": 0.86414981, + "num_input_tokens_seen": 131041744, + "router_z_loss_mlp": 0.33789062, + "step": 1584, + "time_per_iteration": 2.772367000579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098103, + "balance_loss_mlp": 1.06443787, + "epoch": 0.3049249711427472, + "flos": 566789847552.0, + "grad_norm": 0.054268296030043885, + "language_loss": 0.85613728, + "learning_rate": 0.000814555268055744, + "loss": 0.86711836, + "num_input_tokens_seen": 131108864, + "router_z_loss_mlp": 0.33691406, + "step": 1585, + "time_per_iteration": 2.616687536239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109794, + "balance_loss_mlp": 1.06441879, + "epoch": 0.3051173528280108, + "flos": 527970894336.0, + "grad_norm": 0.05527556644311566, + "language_loss": 0.87648201, + "learning_rate": 0.0008143130402348073, + "loss": 0.88746148, + "num_input_tokens_seen": 131181104, + "router_z_loss_mlp": 0.33544922, + "step": 1586, + "time_per_iteration": 2.635103940963745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094771, + "balance_loss_mlp": 1.06141627, + "epoch": 0.3053097345132743, + "flos": 586097427456.0, + "grad_norm": 0.052385807505719764, + "language_loss": 0.79520649, + "learning_rate": 0.0008140706903867265, + "loss": 0.80615419, + "num_input_tokens_seen": 131258704, + "router_z_loss_mlp": 0.33349609, + "step": 1587, + "time_per_iteration": 2.7922940254211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087687, + "balance_loss_mlp": 1.05263984, + "epoch": 0.3055021161985379, + "flos": 606810604032.0, + "grad_norm": 0.054380951058352583, + "language_loss": 0.90043247, + "learning_rate": 0.0008138282186055897, + "loss": 0.9113093, + "num_input_tokens_seen": 131325712, + "router_z_loss_mlp": 0.35058594, + "step": 1588, + "time_per_iteration": 2.683783769607544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085545, + "balance_loss_mlp": 1.05290556, + "epoch": 0.3056944978838015, + "flos": 573594794496.0, + "grad_norm": 0.05235756550943364, + "language_loss": 0.8193745, + "learning_rate": 0.0008135856249855331, + "loss": 0.83023, + "num_input_tokens_seen": 131397568, + "router_z_loss_mlp": 0.32641602, + "step": 1589, + "time_per_iteration": 2.6717309951782227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081523, + "balance_loss_mlp": 1.04900289, + "epoch": 0.305886879569065, + "flos": 633640485888.0, + "grad_norm": 0.06284243799371535, + "language_loss": 0.89691997, + "learning_rate": 0.0008133429096207398, + "loss": 0.90773523, + "num_input_tokens_seen": 131467632, + "router_z_loss_mlp": 0.32519531, + "step": 1590, + "time_per_iteration": 2.757962465286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_mlp": 1.02561319, + "epoch": 0.3060792612543286, + "flos": 1368227414016.0, + "grad_norm": 0.023218914608202516, + "language_loss": 0.75312257, + "learning_rate": 0.0008131000726054403, + "loss": 0.76350176, + "num_input_tokens_seen": 131702224, + "router_z_loss_mlp": 0.12304688, + "step": 1591, + "time_per_iteration": 4.927033185958862 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078048, + "balance_loss_mlp": 1.0450511, + "epoch": 0.30627164293959214, + "flos": 518290887168.0, + "grad_norm": 0.05132667013942606, + "language_loss": 0.86601979, + "learning_rate": 0.0008128571140339123, + "loss": 0.87680024, + "num_input_tokens_seen": 131774608, + "router_z_loss_mlp": 0.33007812, + "step": 1592, + "time_per_iteration": 2.627272367477417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075862, + "balance_loss_mlp": 1.0423162, + "epoch": 0.3064640246248557, + "flos": 455354168832.0, + "grad_norm": 0.054345541641725725, + "language_loss": 0.87405455, + "learning_rate": 0.0008126140340004805, + "loss": 0.88481319, + "num_input_tokens_seen": 131841216, + "router_z_loss_mlp": 0.33569336, + "step": 1593, + "time_per_iteration": 2.5047686100006104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076578, + "balance_loss_mlp": 1.04355717, + "epoch": 0.30665640631011926, + "flos": 849718480896.0, + "grad_norm": 0.04925367115496714, + "language_loss": 0.82262254, + "learning_rate": 0.0008123708325995172, + "loss": 0.83338827, + "num_input_tokens_seen": 131937584, + "router_z_loss_mlp": 0.33032227, + "step": 1594, + "time_per_iteration": 3.1693196296691895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107923, + "balance_loss_mlp": 1.04582715, + "epoch": 0.30684878799538284, + "flos": 757996406784.0, + "grad_norm": 0.04977841797679214, + "language_loss": 0.79901791, + "learning_rate": 0.0008121275099254414, + "loss": 0.80981016, + "num_input_tokens_seen": 132012656, + "router_z_loss_mlp": 0.33422852, + "step": 1595, + "time_per_iteration": 2.9197185039520264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108089, + "balance_loss_mlp": 1.04758275, + "epoch": 0.3070411696806464, + "flos": 517320428544.0, + "grad_norm": 0.05488318824662342, + "language_loss": 0.88300943, + "learning_rate": 0.0008118840660727194, + "loss": 0.89381832, + "num_input_tokens_seen": 132083728, + "router_z_loss_mlp": 0.33325195, + "step": 1596, + "time_per_iteration": 2.6452150344848633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079954, + "balance_loss_mlp": 1.04788685, + "epoch": 0.30723355136590996, + "flos": 843883992576.0, + "grad_norm": 0.05612425557740203, + "language_loss": 0.87403214, + "learning_rate": 0.0008116405011358644, + "loss": 0.88483167, + "num_input_tokens_seen": 132170896, + "router_z_loss_mlp": 0.32055664, + "step": 1597, + "time_per_iteration": 3.135666608810425 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084185, + "balance_loss_mlp": 1.05092525, + "epoch": 0.30742593305117355, + "flos": 465905710080.0, + "grad_norm": 0.05391343675647517, + "language_loss": 0.80005342, + "learning_rate": 0.0008113968152094369, + "loss": 0.81089526, + "num_input_tokens_seen": 132234592, + "router_z_loss_mlp": 0.33276367, + "step": 1598, + "time_per_iteration": 2.5122313499450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076054, + "balance_loss_mlp": 1.04331923, + "epoch": 0.3076183147364371, + "flos": 686286120960.0, + "grad_norm": 0.04979397496165333, + "language_loss": 0.82305032, + "learning_rate": 0.0008111530083880438, + "loss": 0.83381081, + "num_input_tokens_seen": 132314720, + "router_z_loss_mlp": 0.32739258, + "step": 1599, + "time_per_iteration": 2.8883755207061768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072089, + "balance_loss_mlp": 1.03949702, + "epoch": 0.30781069642170067, + "flos": 613729032192.0, + "grad_norm": 0.059000164712882774, + "language_loss": 0.86657357, + "learning_rate": 0.0008109090807663399, + "loss": 0.87729448, + "num_input_tokens_seen": 132388768, + "router_z_loss_mlp": 0.32592773, + "step": 1600, + "time_per_iteration": 2.7799928188323975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075122, + "balance_loss_mlp": 1.04260147, + "epoch": 0.3080030781069642, + "flos": 590021521920.0, + "grad_norm": 0.046450828420206536, + "language_loss": 0.88735926, + "learning_rate": 0.0008106650324390257, + "loss": 0.89811045, + "num_input_tokens_seen": 132472544, + "router_z_loss_mlp": 0.32519531, + "step": 1601, + "time_per_iteration": 2.7887444496154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071489, + "balance_loss_mlp": 1.03913534, + "epoch": 0.3081954597922278, + "flos": 562353601536.0, + "grad_norm": 0.06865077604181559, + "language_loss": 0.81335884, + "learning_rate": 0.0008104208635008493, + "loss": 0.82407373, + "num_input_tokens_seen": 132541968, + "router_z_loss_mlp": 0.32348633, + "step": 1602, + "time_per_iteration": 2.6526358127593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077173, + "balance_loss_mlp": 1.04448628, + "epoch": 0.3083878414774913, + "flos": 447599112192.0, + "grad_norm": 0.053973671264543166, + "language_loss": 0.81925714, + "learning_rate": 0.0008101765740466058, + "loss": 0.83002889, + "num_input_tokens_seen": 132606976, + "router_z_loss_mlp": 0.3269043, + "step": 1603, + "time_per_iteration": 2.5337142944335938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073512, + "balance_loss_mlp": 1.04020488, + "epoch": 0.3085802231627549, + "flos": 493297205760.0, + "grad_norm": 0.05670542728571842, + "language_loss": 0.84135199, + "learning_rate": 0.0008099321641711364, + "loss": 0.85208714, + "num_input_tokens_seen": 132677984, + "router_z_loss_mlp": 0.33325195, + "step": 1604, + "time_per_iteration": 2.6616737842559814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071402, + "balance_loss_mlp": 1.03804755, + "epoch": 0.3087726048480185, + "flos": 487437986304.0, + "grad_norm": 0.0517354770696361, + "language_loss": 0.8343811, + "learning_rate": 0.0008096876339693295, + "loss": 0.8450951, + "num_input_tokens_seen": 132749136, + "router_z_loss_mlp": 0.33374023, + "step": 1605, + "time_per_iteration": 2.6034042835235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078765, + "balance_loss_mlp": 1.04412317, + "epoch": 0.308964986533282, + "flos": 730265877504.0, + "grad_norm": 0.0630488444124333, + "language_loss": 0.8123467, + "learning_rate": 0.0008094429835361206, + "loss": 0.8231343, + "num_input_tokens_seen": 132823824, + "router_z_loss_mlp": 0.34667969, + "step": 1606, + "time_per_iteration": 2.9442811012268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107155, + "balance_loss_mlp": 1.03788495, + "epoch": 0.3091573682185456, + "flos": 605131554816.0, + "grad_norm": 0.0490228515497239, + "language_loss": 0.85833865, + "learning_rate": 0.0008091982129664908, + "loss": 0.8690542, + "num_input_tokens_seen": 132895936, + "router_z_loss_mlp": 0.33691406, + "step": 1607, + "time_per_iteration": 2.734976053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077455, + "balance_loss_mlp": 1.04290783, + "epoch": 0.30934974990380915, + "flos": 460081396224.0, + "grad_norm": 0.04772079658934369, + "language_loss": 0.82646394, + "learning_rate": 0.0008089533223554687, + "loss": 0.83723843, + "num_input_tokens_seen": 132968960, + "router_z_loss_mlp": 0.34594727, + "step": 1608, + "time_per_iteration": 2.756741762161255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080172, + "balance_loss_mlp": 1.04669785, + "epoch": 0.30954213158907273, + "flos": 553142075904.0, + "grad_norm": 0.05499274022240881, + "language_loss": 0.8525604, + "learning_rate": 0.0008087083117981294, + "loss": 0.86336207, + "num_input_tokens_seen": 133048448, + "router_z_loss_mlp": 0.33496094, + "step": 1609, + "time_per_iteration": 2.9062788486480713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081142, + "balance_loss_mlp": 1.04676199, + "epoch": 0.30973451327433627, + "flos": 552776901120.0, + "grad_norm": 0.0512798930400947, + "language_loss": 0.87996054, + "learning_rate": 0.0008084631813895943, + "loss": 0.89077199, + "num_input_tokens_seen": 133121680, + "router_z_loss_mlp": 0.34375, + "step": 1610, + "time_per_iteration": 2.789893627166748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079299, + "balance_loss_mlp": 1.04575384, + "epoch": 0.30992689495959985, + "flos": 565430893056.0, + "grad_norm": 0.07403274033744815, + "language_loss": 0.83632123, + "learning_rate": 0.0008082179312250315, + "loss": 0.84711421, + "num_input_tokens_seen": 133190176, + "router_z_loss_mlp": 0.33544922, + "step": 1611, + "time_per_iteration": 2.713533878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_mlp": 1.02864099, + "epoch": 0.3101192766448634, + "flos": 1441621131264.0, + "grad_norm": 0.023040649208851512, + "language_loss": 0.79855847, + "learning_rate": 0.0008079725613996555, + "loss": 0.80895925, + "num_input_tokens_seen": 133420512, + "router_z_loss_mlp": 0.11425781, + "step": 1612, + "time_per_iteration": 4.866560459136963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_mlp": 1.02448523, + "epoch": 0.31031165833012697, + "flos": 1531086575616.0, + "grad_norm": 0.021256554447441355, + "language_loss": 0.76629329, + "learning_rate": 0.0008077270720087273, + "loss": 0.77664924, + "num_input_tokens_seen": 133651984, + "router_z_loss_mlp": 0.11132812, + "step": 1613, + "time_per_iteration": 5.01593279838562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010886, + "balance_loss_mlp": 1.05483997, + "epoch": 0.31050404001539056, + "flos": 991534196736.0, + "grad_norm": 0.06253960188626659, + "language_loss": 0.81937206, + "learning_rate": 0.0008074814631475545, + "loss": 0.83025801, + "num_input_tokens_seen": 133741648, + "router_z_loss_mlp": 0.33789062, + "step": 1614, + "time_per_iteration": 3.3154871463775635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092644, + "balance_loss_mlp": 1.05940843, + "epoch": 0.3106964217006541, + "flos": 445748355072.0, + "grad_norm": 0.0719929788966035, + "language_loss": 0.78655052, + "learning_rate": 0.0008072357349114907, + "loss": 0.79747701, + "num_input_tokens_seen": 133813344, + "router_z_loss_mlp": 0.33251953, + "step": 1615, + "time_per_iteration": 2.6783502101898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084302, + "balance_loss_mlp": 1.05063736, + "epoch": 0.3108888033859177, + "flos": 510259405824.0, + "grad_norm": 0.06269338504314155, + "language_loss": 0.88523185, + "learning_rate": 0.0008069898873959363, + "loss": 0.89607489, + "num_input_tokens_seen": 133884192, + "router_z_loss_mlp": 0.33691406, + "step": 1616, + "time_per_iteration": 2.6805779933929443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092165, + "balance_loss_mlp": 1.05952573, + "epoch": 0.3110811850711812, + "flos": 520471913472.0, + "grad_norm": 0.06669389997650658, + "language_loss": 0.85964763, + "learning_rate": 0.0008067439206963375, + "loss": 0.87056935, + "num_input_tokens_seen": 133954848, + "router_z_loss_mlp": 0.32641602, + "step": 1617, + "time_per_iteration": 2.6084542274475098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091143, + "balance_loss_mlp": 1.05797851, + "epoch": 0.3112735667564448, + "flos": 686085299712.0, + "grad_norm": 0.06020913179087489, + "language_loss": 0.86049557, + "learning_rate": 0.0008064978349081873, + "loss": 0.87140703, + "num_input_tokens_seen": 134031824, + "router_z_loss_mlp": 0.33178711, + "step": 1618, + "time_per_iteration": 2.9622116088867188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089777, + "balance_loss_mlp": 1.05554032, + "epoch": 0.31146594844170833, + "flos": 532786871808.0, + "grad_norm": 0.04562356821057988, + "language_loss": 0.86218596, + "learning_rate": 0.0008062516301270245, + "loss": 0.87308377, + "num_input_tokens_seen": 134104480, + "router_z_loss_mlp": 0.3425293, + "step": 1619, + "time_per_iteration": 2.691589593887329 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091127, + "balance_loss_mlp": 1.0575099, + "epoch": 0.3116583301269719, + "flos": 679187220480.0, + "grad_norm": 0.05429224886242875, + "language_loss": 0.88343138, + "learning_rate": 0.0008060053064484343, + "loss": 0.89434266, + "num_input_tokens_seen": 134185632, + "router_z_loss_mlp": 0.33642578, + "step": 1620, + "time_per_iteration": 2.936244487762451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096127, + "balance_loss_mlp": 1.06277251, + "epoch": 0.31185071181223545, + "flos": 585855908352.0, + "grad_norm": 0.05040245512912965, + "language_loss": 0.85009742, + "learning_rate": 0.0008057588639680482, + "loss": 0.86105865, + "num_input_tokens_seen": 134261600, + "router_z_loss_mlp": 0.33374023, + "step": 1621, + "time_per_iteration": 2.7633163928985596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107807, + "balance_loss_mlp": 1.07309282, + "epoch": 0.31204309349749904, + "flos": 725090517504.0, + "grad_norm": 0.06801147163116106, + "language_loss": 0.82624507, + "learning_rate": 0.0008055123027815434, + "loss": 0.83732307, + "num_input_tokens_seen": 134334368, + "router_z_loss_mlp": 0.34741211, + "step": 1622, + "time_per_iteration": 2.946943521499634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105084, + "balance_loss_mlp": 1.07156253, + "epoch": 0.3122354751827626, + "flos": 576558604800.0, + "grad_norm": 0.0611005921730787, + "language_loss": 0.85109818, + "learning_rate": 0.0008052656229846436, + "loss": 0.862149, + "num_input_tokens_seen": 134403824, + "router_z_loss_mlp": 0.33544922, + "step": 1623, + "time_per_iteration": 2.6431145668029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106208, + "balance_loss_mlp": 1.07106483, + "epoch": 0.31242785686802615, + "flos": 575672514048.0, + "grad_norm": 0.055122717603047884, + "language_loss": 0.90674621, + "learning_rate": 0.0008050188246731182, + "loss": 0.91780829, + "num_input_tokens_seen": 134471296, + "router_z_loss_mlp": 0.35180664, + "step": 1624, + "time_per_iteration": 2.6666738986968994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101645, + "balance_loss_mlp": 1.06745625, + "epoch": 0.31262023855328974, + "flos": 736490271744.0, + "grad_norm": 0.05430344032768667, + "language_loss": 0.81962687, + "learning_rate": 0.0008047719079427834, + "loss": 0.8306433, + "num_input_tokens_seen": 134551360, + "router_z_loss_mlp": 0.34204102, + "step": 1625, + "time_per_iteration": 2.978775978088379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095181, + "balance_loss_mlp": 1.07791936, + "epoch": 0.3128126202385533, + "flos": 1558395113472.0, + "grad_norm": 0.034550759669135796, + "language_loss": 0.74351704, + "learning_rate": 0.0008045248728895, + "loss": 0.75446886, + "num_input_tokens_seen": 134761328, + "router_z_loss_mlp": 0.17285156, + "step": 1626, + "time_per_iteration": 4.800370931625366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109181, + "balance_loss_mlp": 1.05695319, + "epoch": 0.31300500192381686, + "flos": 514666538496.0, + "grad_norm": 0.04752817769408696, + "language_loss": 0.86259782, + "learning_rate": 0.0008042777196091757, + "loss": 0.87351596, + "num_input_tokens_seen": 134833136, + "router_z_loss_mlp": 0.34863281, + "step": 1627, + "time_per_iteration": 2.695350408554077 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088728, + "balance_loss_mlp": 1.05301261, + "epoch": 0.3131973836090804, + "flos": 526370420736.0, + "grad_norm": 0.06407391520506579, + "language_loss": 0.8214981, + "learning_rate": 0.0008040304481977643, + "loss": 0.83238542, + "num_input_tokens_seen": 134904352, + "router_z_loss_mlp": 0.35742188, + "step": 1628, + "time_per_iteration": 2.6652393341064453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108342, + "balance_loss_mlp": 1.05030346, + "epoch": 0.313389765294344, + "flos": 822473961984.0, + "grad_norm": 0.08346342139557943, + "language_loss": 0.86950874, + "learning_rate": 0.0008037830587512649, + "loss": 0.88034296, + "num_input_tokens_seen": 134984880, + "router_z_loss_mlp": 0.33129883, + "step": 1629, + "time_per_iteration": 3.0668327808380127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090413, + "balance_loss_mlp": 1.05651021, + "epoch": 0.31358214697960757, + "flos": 393604697088.0, + "grad_norm": 0.061409761762948115, + "language_loss": 0.78720629, + "learning_rate": 0.0008035355513657224, + "loss": 0.79811049, + "num_input_tokens_seen": 135047456, + "router_z_loss_mlp": 0.33935547, + "step": 1630, + "time_per_iteration": 2.5013740062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087918, + "balance_loss_mlp": 1.05449188, + "epoch": 0.3137745286648711, + "flos": 571611617280.0, + "grad_norm": 0.049199842100191564, + "language_loss": 0.93020999, + "learning_rate": 0.0008032879261372279, + "loss": 0.94108921, + "num_input_tokens_seen": 135124256, + "router_z_loss_mlp": 0.33447266, + "step": 1631, + "time_per_iteration": 2.8559622764587402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088241, + "balance_loss_mlp": 1.07612944, + "epoch": 0.3139669103501347, + "flos": 1497614690304.0, + "grad_norm": 0.04267228885339989, + "language_loss": 0.79635841, + "learning_rate": 0.0008030401831619178, + "loss": 0.80724084, + "num_input_tokens_seen": 135353024, + "router_z_loss_mlp": 0.12109375, + "step": 1632, + "time_per_iteration": 5.726024627685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081932, + "balance_loss_mlp": 1.04886341, + "epoch": 0.3141592920353982, + "flos": 525090041856.0, + "grad_norm": 0.04986838794009694, + "language_loss": 0.87459773, + "learning_rate": 0.0008027923225359748, + "loss": 0.88541704, + "num_input_tokens_seen": 135422464, + "router_z_loss_mlp": 0.33081055, + "step": 1633, + "time_per_iteration": 2.599775791168213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078913, + "balance_loss_mlp": 1.04465246, + "epoch": 0.3143516737206618, + "flos": 592989714432.0, + "grad_norm": 0.05680374588643473, + "language_loss": 0.8835839, + "learning_rate": 0.0008025443443556267, + "loss": 0.89437306, + "num_input_tokens_seen": 135490928, + "router_z_loss_mlp": 0.34301758, + "step": 1634, + "time_per_iteration": 2.7439024448394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010804, + "balance_loss_mlp": 1.04776073, + "epoch": 0.31454405540592534, + "flos": 648034573824.0, + "grad_norm": 0.04764849369773053, + "language_loss": 0.88161099, + "learning_rate": 0.000802296248717147, + "loss": 0.89241499, + "num_input_tokens_seen": 135576288, + "router_z_loss_mlp": 0.32641602, + "step": 1635, + "time_per_iteration": 2.902290105819702 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082457, + "balance_loss_mlp": 1.04850602, + "epoch": 0.3147364370911889, + "flos": 642543501312.0, + "grad_norm": 0.05380775409858787, + "language_loss": 0.79150212, + "learning_rate": 0.0008020480357168554, + "loss": 0.80232668, + "num_input_tokens_seen": 135652320, + "router_z_loss_mlp": 0.33984375, + "step": 1636, + "time_per_iteration": 2.7940564155578613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107486, + "balance_loss_mlp": 1.04176795, + "epoch": 0.31492881877645246, + "flos": 471607778304.0, + "grad_norm": 0.05509564816324918, + "language_loss": 0.88341308, + "learning_rate": 0.0008017997054511165, + "loss": 0.89416164, + "num_input_tokens_seen": 135719632, + "router_z_loss_mlp": 0.33105469, + "step": 1637, + "time_per_iteration": 2.596212148666382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075188, + "balance_loss_mlp": 1.04157114, + "epoch": 0.31512120046171604, + "flos": 629135838720.0, + "grad_norm": 0.0536589952194777, + "language_loss": 0.8549943, + "learning_rate": 0.0008015512580163407, + "loss": 0.86574614, + "num_input_tokens_seen": 135796544, + "router_z_loss_mlp": 0.33642578, + "step": 1638, + "time_per_iteration": 2.763343334197998 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107569, + "balance_loss_mlp": 1.0416913, + "epoch": 0.31531358214697963, + "flos": 703460726784.0, + "grad_norm": 0.0636877441873346, + "language_loss": 0.80888116, + "learning_rate": 0.0008013026935089838, + "loss": 0.81963813, + "num_input_tokens_seen": 135871344, + "router_z_loss_mlp": 0.34033203, + "step": 1639, + "time_per_iteration": 2.9786083698272705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070933, + "balance_loss_mlp": 1.03798366, + "epoch": 0.31550596383224316, + "flos": 572275127808.0, + "grad_norm": 0.055086353977466425, + "language_loss": 0.83909047, + "learning_rate": 0.0008010540120255472, + "loss": 0.84979975, + "num_input_tokens_seen": 135944320, + "router_z_loss_mlp": 0.32958984, + "step": 1640, + "time_per_iteration": 2.666520357131958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075633, + "balance_loss_mlp": 1.04196858, + "epoch": 0.31569834551750675, + "flos": 658047822336.0, + "grad_norm": 0.06483249406864507, + "language_loss": 0.86052895, + "learning_rate": 0.0008008052136625774, + "loss": 0.8712852, + "num_input_tokens_seen": 136019456, + "router_z_loss_mlp": 0.33691406, + "step": 1641, + "time_per_iteration": 2.8062589168548584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078165, + "balance_loss_mlp": 1.04407096, + "epoch": 0.3158907272027703, + "flos": 566002681344.0, + "grad_norm": 0.05792040128516231, + "language_loss": 0.86837387, + "learning_rate": 0.0008005562985166666, + "loss": 0.87915552, + "num_input_tokens_seen": 136091232, + "router_z_loss_mlp": 0.34130859, + "step": 1642, + "time_per_iteration": 2.6996512413024902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081029, + "balance_loss_mlp": 1.04576707, + "epoch": 0.31608310888803387, + "flos": 536622216192.0, + "grad_norm": 0.04642534602938139, + "language_loss": 0.84936821, + "learning_rate": 0.0008003072666844524, + "loss": 0.86017853, + "num_input_tokens_seen": 136165088, + "router_z_loss_mlp": 0.35302734, + "step": 1643, + "time_per_iteration": 2.6999776363372803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078239, + "balance_loss_mlp": 1.04292917, + "epoch": 0.3162754905732974, + "flos": 486428239872.0, + "grad_norm": 0.08271259063406261, + "language_loss": 0.82613683, + "learning_rate": 0.0008000581182626173, + "loss": 0.83691919, + "num_input_tokens_seen": 136230368, + "router_z_loss_mlp": 0.35302734, + "step": 1644, + "time_per_iteration": 2.541093111038208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082389, + "balance_loss_mlp": 1.04893875, + "epoch": 0.316467872258561, + "flos": 529792538112.0, + "grad_norm": 0.058359985905672214, + "language_loss": 0.86275887, + "learning_rate": 0.0007998088533478894, + "loss": 0.87358278, + "num_input_tokens_seen": 136302512, + "router_z_loss_mlp": 0.33447266, + "step": 1645, + "time_per_iteration": 2.641402006149292 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077931, + "balance_loss_mlp": 1.04309845, + "epoch": 0.3166602539438245, + "flos": 443197771776.0, + "grad_norm": 0.07387441321187599, + "language_loss": 0.84062803, + "learning_rate": 0.000799559472037042, + "loss": 0.85140741, + "num_input_tokens_seen": 136368064, + "router_z_loss_mlp": 0.34887695, + "step": 1646, + "time_per_iteration": 2.5274438858032227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076892, + "balance_loss_mlp": 1.04222584, + "epoch": 0.3168526356290881, + "flos": 645513103872.0, + "grad_norm": 0.053861363144643716, + "language_loss": 0.87875295, + "learning_rate": 0.0007993099744268932, + "loss": 0.8895219, + "num_input_tokens_seen": 136451520, + "router_z_loss_mlp": 0.34716797, + "step": 1647, + "time_per_iteration": 2.8893649578094482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070585, + "balance_loss_mlp": 1.03646684, + "epoch": 0.3170450173143517, + "flos": 585889403904.0, + "grad_norm": 0.05841982976759713, + "language_loss": 0.87792766, + "learning_rate": 0.000799060360614307, + "loss": 0.88863349, + "num_input_tokens_seen": 136521184, + "router_z_loss_mlp": 0.34155273, + "step": 1648, + "time_per_iteration": 2.6867480278015137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076826, + "balance_loss_mlp": 1.04273248, + "epoch": 0.3172373989996152, + "flos": 826763231232.0, + "grad_norm": 0.05654214693871822, + "language_loss": 0.83848637, + "learning_rate": 0.0007988106306961917, + "loss": 0.84925467, + "num_input_tokens_seen": 136612592, + "router_z_loss_mlp": 0.34130859, + "step": 1649, + "time_per_iteration": 3.1321003437042236 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080036, + "balance_loss_mlp": 1.04577541, + "epoch": 0.3174297806848788, + "flos": 527153204736.0, + "grad_norm": 0.060794493166337976, + "language_loss": 0.84529203, + "learning_rate": 0.0007985607847695014, + "loss": 0.85609239, + "num_input_tokens_seen": 136684336, + "router_z_loss_mlp": 0.34301758, + "step": 1650, + "time_per_iteration": 2.6306049823760986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081772, + "balance_loss_mlp": 1.04851258, + "epoch": 0.31762216237014235, + "flos": 712855544832.0, + "grad_norm": 0.05325998456044798, + "language_loss": 0.82638443, + "learning_rate": 0.0007983108229312345, + "loss": 0.83720207, + "num_input_tokens_seen": 136766400, + "router_z_loss_mlp": 0.33276367, + "step": 1651, + "time_per_iteration": 2.909571647644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108965, + "balance_loss_mlp": 1.05567503, + "epoch": 0.31781454405540593, + "flos": 483567736320.0, + "grad_norm": 0.0653784528473409, + "language_loss": 0.86672306, + "learning_rate": 0.0007980607452784351, + "loss": 0.87761962, + "num_input_tokens_seen": 136834016, + "router_z_loss_mlp": 0.33984375, + "step": 1652, + "time_per_iteration": 2.5339765548706055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06639075, + "epoch": 0.31800692574066947, + "flos": 548483249664.0, + "grad_norm": 0.0685029555550019, + "language_loss": 0.90562367, + "learning_rate": 0.0007978105519081919, + "loss": 0.91662765, + "num_input_tokens_seen": 136906288, + "router_z_loss_mlp": 0.34008789, + "step": 1653, + "time_per_iteration": 2.6916213035583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.06213784, + "epoch": 0.31819930742593305, + "flos": 516640951296.0, + "grad_norm": 0.07941193091019123, + "language_loss": 0.87969935, + "learning_rate": 0.0007975602429176385, + "loss": 0.89066482, + "num_input_tokens_seen": 136972416, + "router_z_loss_mlp": 0.34423828, + "step": 1654, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100835, + "balance_loss_mlp": 1.06695616, + "epoch": 0.31839168911119664, + "flos": 455748456960.0, + "grad_norm": 0.07129171690745044, + "language_loss": 0.81582803, + "learning_rate": 0.0007973098184039536, + "loss": 0.82683635, + "num_input_tokens_seen": 137044576, + "router_z_loss_mlp": 0.33911133, + "step": 1655, + "time_per_iteration": 2.654914140701294 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096724, + "balance_loss_mlp": 1.06284511, + "epoch": 0.3185840707964602, + "flos": 625719513600.0, + "grad_norm": 0.05658637496419385, + "language_loss": 0.86710656, + "learning_rate": 0.0007970592784643602, + "loss": 0.87807381, + "num_input_tokens_seen": 137125120, + "router_z_loss_mlp": 0.33911133, + "step": 1656, + "time_per_iteration": 2.846390962600708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090807, + "balance_loss_mlp": 1.05719042, + "epoch": 0.31877645248172376, + "flos": 567213249024.0, + "grad_norm": 0.058346793379709355, + "language_loss": 0.85032123, + "learning_rate": 0.0007968086231961272, + "loss": 0.8612293, + "num_input_tokens_seen": 137195344, + "router_z_loss_mlp": 0.33642578, + "step": 1657, + "time_per_iteration": 2.652986526489258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094073, + "balance_loss_mlp": 1.0593828, + "epoch": 0.3189688341669873, + "flos": 489338205696.0, + "grad_norm": 0.08644842740903268, + "language_loss": 0.836254, + "learning_rate": 0.0007965578526965671, + "loss": 0.84719473, + "num_input_tokens_seen": 137261040, + "router_z_loss_mlp": 0.34741211, + "step": 1658, + "time_per_iteration": 2.5607872009277344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092531, + "balance_loss_mlp": 1.05860353, + "epoch": 0.3191612158522509, + "flos": 575948938752.0, + "grad_norm": 0.04707712809776705, + "language_loss": 0.86020696, + "learning_rate": 0.0007963069670630377, + "loss": 0.87113225, + "num_input_tokens_seen": 137334400, + "router_z_loss_mlp": 0.33959961, + "step": 1659, + "time_per_iteration": 2.7435247898101807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097687, + "balance_loss_mlp": 1.06447566, + "epoch": 0.3193535975375144, + "flos": 537867689472.0, + "grad_norm": 0.062321727217464123, + "language_loss": 0.87956834, + "learning_rate": 0.0007960559663929416, + "loss": 0.89054519, + "num_input_tokens_seen": 137405344, + "router_z_loss_mlp": 0.33227539, + "step": 1660, + "time_per_iteration": 2.6282846927642822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096682, + "balance_loss_mlp": 1.06265998, + "epoch": 0.319545979222778, + "flos": 733954245120.0, + "grad_norm": 0.07201541894751945, + "language_loss": 0.87465358, + "learning_rate": 0.0007958048507837259, + "loss": 0.88562042, + "num_input_tokens_seen": 137486016, + "router_z_loss_mlp": 0.34057617, + "step": 1661, + "time_per_iteration": 2.9250974655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099132, + "balance_loss_mlp": 1.0647999, + "epoch": 0.31973836090804153, + "flos": 764136433152.0, + "grad_norm": 0.0721917610669121, + "language_loss": 0.87230003, + "learning_rate": 0.0007955536203328822, + "loss": 0.8832913, + "num_input_tokens_seen": 137562304, + "router_z_loss_mlp": 0.34375, + "step": 1662, + "time_per_iteration": 2.899735450744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109485, + "balance_loss_mlp": 1.06109047, + "epoch": 0.3199307425933051, + "flos": 560252560896.0, + "grad_norm": 0.06666532975578916, + "language_loss": 0.83308822, + "learning_rate": 0.0007953022751379469, + "loss": 0.84403676, + "num_input_tokens_seen": 137639248, + "router_z_loss_mlp": 0.33789062, + "step": 1663, + "time_per_iteration": 2.7743418216705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093375, + "balance_loss_mlp": 1.05899549, + "epoch": 0.3201231242785687, + "flos": 751019751936.0, + "grad_norm": 0.058114271957014456, + "language_loss": 0.81677037, + "learning_rate": 0.000795050815296501, + "loss": 0.82770407, + "num_input_tokens_seen": 137718256, + "router_z_loss_mlp": 0.34399414, + "step": 1664, + "time_per_iteration": 2.9620323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091229, + "balance_loss_mlp": 1.05768323, + "epoch": 0.32031550596383224, + "flos": 496157709312.0, + "grad_norm": 0.061791342299560625, + "language_loss": 0.93274921, + "learning_rate": 0.0007947992409061695, + "loss": 0.94366151, + "num_input_tokens_seen": 137785216, + "router_z_loss_mlp": 0.33569336, + "step": 1665, + "time_per_iteration": 2.6097099781036377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083934, + "balance_loss_mlp": 1.05022132, + "epoch": 0.3205078876490958, + "flos": 731294562816.0, + "grad_norm": 0.05133774923717053, + "language_loss": 0.86471802, + "learning_rate": 0.0007945475520646226, + "loss": 0.8755573, + "num_input_tokens_seen": 137863424, + "router_z_loss_mlp": 0.33740234, + "step": 1666, + "time_per_iteration": 2.9224231243133545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088823, + "balance_loss_mlp": 1.05487204, + "epoch": 0.32070026933435936, + "flos": 549177283584.0, + "grad_norm": 0.1345109768982335, + "language_loss": 0.8496111, + "learning_rate": 0.0007942957488695743, + "loss": 0.86049932, + "num_input_tokens_seen": 137930384, + "router_z_loss_mlp": 0.33959961, + "step": 1667, + "time_per_iteration": 2.6267666816711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086537, + "balance_loss_mlp": 1.05265749, + "epoch": 0.32089265101962294, + "flos": 744949536768.0, + "grad_norm": 0.061316479944915916, + "language_loss": 0.80963373, + "learning_rate": 0.0007940438314187833, + "loss": 0.82049918, + "num_input_tokens_seen": 138017200, + "router_z_loss_mlp": 0.33886719, + "step": 1668, + "time_per_iteration": 3.00421142578125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089646, + "balance_loss_mlp": 1.05638647, + "epoch": 0.3210850327048865, + "flos": 493937395200.0, + "grad_norm": 0.05864654089818211, + "language_loss": 0.80047274, + "learning_rate": 0.0007937917998100529, + "loss": 0.81136918, + "num_input_tokens_seen": 138084048, + "router_z_loss_mlp": 0.33276367, + "step": 1669, + "time_per_iteration": 2.607917070388794 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06610548, + "epoch": 0.32127741439015006, + "flos": 530383265280.0, + "grad_norm": 0.060159342011431034, + "language_loss": 0.78680766, + "learning_rate": 0.0007935396541412302, + "loss": 0.79781532, + "num_input_tokens_seen": 138153280, + "router_z_loss_mlp": 0.34692383, + "step": 1670, + "time_per_iteration": 2.6022346019744873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108678, + "balance_loss_mlp": 1.07458389, + "epoch": 0.3214697960754136, + "flos": 500948955648.0, + "grad_norm": 0.07085567852213893, + "language_loss": 0.85879421, + "learning_rate": 0.0007932873945102068, + "loss": 0.86988097, + "num_input_tokens_seen": 138222320, + "router_z_loss_mlp": 0.34130859, + "step": 1671, + "time_per_iteration": 2.581815719604492 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120776, + "balance_loss_mlp": 1.10942781, + "epoch": 0.3216621777606772, + "flos": 1382579394048.0, + "grad_norm": 0.04969555951860313, + "language_loss": 0.75761777, + "learning_rate": 0.0007930350210149188, + "loss": 0.76882553, + "num_input_tokens_seen": 138449488, + "router_z_loss_mlp": 0.11328125, + "step": 1672, + "time_per_iteration": 4.821724891662598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106649, + "balance_loss_mlp": 1.07193518, + "epoch": 0.32185455944594077, + "flos": 571260999168.0, + "grad_norm": 0.05896773993357689, + "language_loss": 0.86527109, + "learning_rate": 0.0007927825337533461, + "loss": 0.87633765, + "num_input_tokens_seen": 138522496, + "router_z_loss_mlp": 0.34765625, + "step": 1673, + "time_per_iteration": 2.6640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105888, + "balance_loss_mlp": 1.07184219, + "epoch": 0.3220469411312043, + "flos": 543652715520.0, + "grad_norm": 0.06618360944756078, + "language_loss": 0.84761298, + "learning_rate": 0.0007925299328235131, + "loss": 0.85867184, + "num_input_tokens_seen": 138590096, + "router_z_loss_mlp": 0.34057617, + "step": 1674, + "time_per_iteration": 2.6524405479431152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102416, + "balance_loss_mlp": 1.06705832, + "epoch": 0.3222393228164679, + "flos": 490884834816.0, + "grad_norm": 0.05872681692102293, + "language_loss": 0.85148364, + "learning_rate": 0.000792277218323488, + "loss": 0.86250782, + "num_input_tokens_seen": 138658224, + "router_z_loss_mlp": 0.35424805, + "step": 1675, + "time_per_iteration": 2.557460069656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100116, + "balance_loss_mlp": 1.06523526, + "epoch": 0.3224317045017314, + "flos": 490145720832.0, + "grad_norm": 0.05188137415598196, + "language_loss": 0.84647608, + "learning_rate": 0.0007920243903513833, + "loss": 0.85747719, + "num_input_tokens_seen": 138722864, + "router_z_loss_mlp": 0.34912109, + "step": 1676, + "time_per_iteration": 2.5208208560943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092752, + "balance_loss_mlp": 1.05813313, + "epoch": 0.322624086186995, + "flos": 575505188352.0, + "grad_norm": 0.06429192544800656, + "language_loss": 0.83992624, + "learning_rate": 0.0007917714490053556, + "loss": 0.8508538, + "num_input_tokens_seen": 138791472, + "router_z_loss_mlp": 0.34667969, + "step": 1677, + "time_per_iteration": 2.653686761856079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109123, + "balance_loss_mlp": 1.05668271, + "epoch": 0.32281646787225854, + "flos": 628974305280.0, + "grad_norm": 0.048890211607645416, + "language_loss": 0.86094737, + "learning_rate": 0.0007915183943836055, + "loss": 0.87185967, + "num_input_tokens_seen": 138873424, + "router_z_loss_mlp": 0.34594727, + "step": 1678, + "time_per_iteration": 2.852612018585205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083818, + "balance_loss_mlp": 1.04950905, + "epoch": 0.3230088495575221, + "flos": 781036024320.0, + "grad_norm": 0.05620908679364121, + "language_loss": 0.83880055, + "learning_rate": 0.0007912652265843773, + "loss": 0.8496387, + "num_input_tokens_seen": 138956880, + "router_z_loss_mlp": 0.34350586, + "step": 1679, + "time_per_iteration": 3.006805419921875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077767, + "balance_loss_mlp": 1.0433867, + "epoch": 0.3232012312427857, + "flos": 535839432192.0, + "grad_norm": 0.04762836982551939, + "language_loss": 0.81587136, + "learning_rate": 0.0007910119457059597, + "loss": 0.82664907, + "num_input_tokens_seen": 139031296, + "router_z_loss_mlp": 0.34423828, + "step": 1680, + "time_per_iteration": 2.6930737495422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076411, + "balance_loss_mlp": 1.04129148, + "epoch": 0.32339361292804925, + "flos": 704515553280.0, + "grad_norm": 0.06110281418881611, + "language_loss": 0.80031025, + "learning_rate": 0.0007907585518466849, + "loss": 0.81107438, + "num_input_tokens_seen": 139109776, + "router_z_loss_mlp": 0.35180664, + "step": 1681, + "time_per_iteration": 2.940950870513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081204, + "balance_loss_mlp": 1.04603744, + "epoch": 0.32358599461331283, + "flos": 452099377152.0, + "grad_norm": 0.0474614445796137, + "language_loss": 0.90124965, + "learning_rate": 0.000790505045104929, + "loss": 0.91206169, + "num_input_tokens_seen": 139174736, + "router_z_loss_mlp": 0.35205078, + "step": 1682, + "time_per_iteration": 2.4919469356536865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078264, + "balance_loss_mlp": 1.0435977, + "epoch": 0.32377837629857636, + "flos": 600597794304.0, + "grad_norm": 0.057051782898604, + "language_loss": 0.86989701, + "learning_rate": 0.0007902514255791125, + "loss": 0.88067961, + "num_input_tokens_seen": 139252064, + "router_z_loss_mlp": 0.34692383, + "step": 1683, + "time_per_iteration": 2.7545859813690186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108269, + "balance_loss_mlp": 1.04721308, + "epoch": 0.32397075798383995, + "flos": 807180636672.0, + "grad_norm": 0.05145240385219177, + "language_loss": 0.87981123, + "learning_rate": 0.0007899976933676986, + "loss": 0.89063811, + "num_input_tokens_seen": 139333328, + "router_z_loss_mlp": 0.35498047, + "step": 1684, + "time_per_iteration": 2.97807240486145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081365, + "balance_loss_mlp": 1.04638934, + "epoch": 0.3241631396691035, + "flos": 601414073856.0, + "grad_norm": 0.06429290680378846, + "language_loss": 0.8767072, + "learning_rate": 0.0007897438485691955, + "loss": 0.88752091, + "num_input_tokens_seen": 139400976, + "router_z_loss_mlp": 0.3503418, + "step": 1685, + "time_per_iteration": 2.704326868057251 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083548, + "balance_loss_mlp": 1.04826176, + "epoch": 0.32435552135436707, + "flos": 473980861440.0, + "grad_norm": 0.058364951070402814, + "language_loss": 0.82023847, + "learning_rate": 0.0007894898912821542, + "loss": 0.831074, + "num_input_tokens_seen": 139465664, + "router_z_loss_mlp": 0.3527832, + "step": 1686, + "time_per_iteration": 2.5206680297851562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076257, + "balance_loss_mlp": 1.04178166, + "epoch": 0.3245479030396306, + "flos": 537824019456.0, + "grad_norm": 0.04476181031616706, + "language_loss": 0.86661267, + "learning_rate": 0.0007892358216051695, + "loss": 0.87737525, + "num_input_tokens_seen": 139541984, + "router_z_loss_mlp": 0.3449707, + "step": 1687, + "time_per_iteration": 2.7332026958465576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075747, + "balance_loss_mlp": 1.04108071, + "epoch": 0.3247402847248942, + "flos": 547394927616.0, + "grad_norm": 0.05643246072623682, + "language_loss": 0.92275292, + "learning_rate": 0.0007889816396368803, + "loss": 0.93351042, + "num_input_tokens_seen": 139607408, + "router_z_loss_mlp": 0.34692383, + "step": 1688, + "time_per_iteration": 2.6158432960510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077832, + "balance_loss_mlp": 1.04388082, + "epoch": 0.3249326664101578, + "flos": 377941814784.0, + "grad_norm": 0.04953960067471088, + "language_loss": 0.85575634, + "learning_rate": 0.0007887273454759687, + "loss": 0.86653465, + "num_input_tokens_seen": 139670000, + "router_z_loss_mlp": 0.33984375, + "step": 1689, + "time_per_iteration": 2.4958317279815674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075156, + "balance_loss_mlp": 1.04051399, + "epoch": 0.3251250480954213, + "flos": 527818125312.0, + "grad_norm": 0.050587956688220255, + "language_loss": 0.82717729, + "learning_rate": 0.0007884729392211603, + "loss": 0.83792883, + "num_input_tokens_seen": 139739872, + "router_z_loss_mlp": 0.34692383, + "step": 1690, + "time_per_iteration": 2.6325736045837402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075634, + "balance_loss_mlp": 1.04113472, + "epoch": 0.3253174297806849, + "flos": 449435312640.0, + "grad_norm": 0.06211432544239721, + "language_loss": 0.85214412, + "learning_rate": 0.0007882184209712245, + "loss": 0.8629005, + "num_input_tokens_seen": 139802032, + "router_z_loss_mlp": 0.34545898, + "step": 1691, + "time_per_iteration": 2.5199012756347656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076936, + "balance_loss_mlp": 1.04303288, + "epoch": 0.32550981146594843, + "flos": 703855014912.0, + "grad_norm": 0.0444021152083115, + "language_loss": 0.85646939, + "learning_rate": 0.000787963790824974, + "loss": 0.86723876, + "num_input_tokens_seen": 139885648, + "router_z_loss_mlp": 0.33935547, + "step": 1692, + "time_per_iteration": 2.9585483074188232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076864, + "balance_loss_mlp": 1.04217362, + "epoch": 0.325702193151212, + "flos": 392491643904.0, + "grad_norm": 0.06035071191190156, + "language_loss": 0.89588344, + "learning_rate": 0.0007877090488812651, + "loss": 0.90665203, + "num_input_tokens_seen": 139947920, + "router_z_loss_mlp": 0.34716797, + "step": 1693, + "time_per_iteration": 2.4247992038726807 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073167, + "balance_loss_mlp": 1.0379529, + "epoch": 0.32589457483647555, + "flos": 577223525376.0, + "grad_norm": 0.051335929306222446, + "language_loss": 0.83377099, + "learning_rate": 0.0007874541952389973, + "loss": 0.84450269, + "num_input_tokens_seen": 140020048, + "router_z_loss_mlp": 0.35253906, + "step": 1694, + "time_per_iteration": 2.679868459701538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074307, + "balance_loss_mlp": 1.03947401, + "epoch": 0.32608695652173914, + "flos": 498092834304.0, + "grad_norm": 0.051580366849716015, + "language_loss": 0.86795568, + "learning_rate": 0.0007871992299971136, + "loss": 0.87869877, + "num_input_tokens_seen": 140085600, + "router_z_loss_mlp": 0.34887695, + "step": 1695, + "time_per_iteration": 2.6005072593688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079265, + "balance_loss_mlp": 1.04431272, + "epoch": 0.32627933820700267, + "flos": 590858150400.0, + "grad_norm": 0.054409905067417906, + "language_loss": 0.84529006, + "learning_rate": 0.0007869441532546001, + "loss": 0.85608268, + "num_input_tokens_seen": 140155152, + "router_z_loss_mlp": 0.34985352, + "step": 1696, + "time_per_iteration": 2.7373292446136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071229, + "balance_loss_mlp": 1.03749299, + "epoch": 0.32647171989226625, + "flos": 608790809088.0, + "grad_norm": 0.05196776598603691, + "language_loss": 0.79551816, + "learning_rate": 0.0007866889651104867, + "loss": 0.80623043, + "num_input_tokens_seen": 140228560, + "router_z_loss_mlp": 0.33764648, + "step": 1697, + "time_per_iteration": 2.768869638442993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069598, + "balance_loss_mlp": 1.03464603, + "epoch": 0.32666410157752984, + "flos": 476896619520.0, + "grad_norm": 0.05699082390473629, + "language_loss": 0.83313, + "learning_rate": 0.000786433665663846, + "loss": 0.84382606, + "num_input_tokens_seen": 140297952, + "router_z_loss_mlp": 0.34985352, + "step": 1698, + "time_per_iteration": 2.6574184894561768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070595, + "balance_loss_mlp": 1.03664398, + "epoch": 0.3268564832627934, + "flos": 718060018176.0, + "grad_norm": 0.0499104315286651, + "language_loss": 0.86441195, + "learning_rate": 0.0007861782550137942, + "loss": 0.8751179, + "num_input_tokens_seen": 140373408, + "router_z_loss_mlp": 0.33984375, + "step": 1699, + "time_per_iteration": 2.8897016048431396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071541, + "balance_loss_mlp": 1.0379957, + "epoch": 0.32704886494805696, + "flos": 768469372416.0, + "grad_norm": 0.05892131453680714, + "language_loss": 0.85990739, + "learning_rate": 0.0007859227332594901, + "loss": 0.87062275, + "num_input_tokens_seen": 140451840, + "router_z_loss_mlp": 0.33569336, + "step": 1700, + "time_per_iteration": 2.8941755294799805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080492, + "balance_loss_mlp": 1.046803, + "epoch": 0.3272412466333205, + "flos": 849540980736.0, + "grad_norm": 0.0647173620985618, + "language_loss": 0.84537613, + "learning_rate": 0.0007856671005001365, + "loss": 0.85618103, + "num_input_tokens_seen": 140537696, + "router_z_loss_mlp": 0.3371582, + "step": 1701, + "time_per_iteration": 3.1362555027008057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107778, + "balance_loss_mlp": 1.04373336, + "epoch": 0.3274336283185841, + "flos": 831224208384.0, + "grad_norm": 0.055785838120560656, + "language_loss": 0.81608075, + "learning_rate": 0.0007854113568349787, + "loss": 0.82685852, + "num_input_tokens_seen": 140623536, + "router_z_loss_mlp": 0.34082031, + "step": 1702, + "time_per_iteration": 3.0684425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108348, + "balance_loss_mlp": 1.04900455, + "epoch": 0.3276260100038476, + "flos": 691721938944.0, + "grad_norm": 0.059478075679183354, + "language_loss": 0.80008304, + "learning_rate": 0.0007851555023633052, + "loss": 0.81091785, + "num_input_tokens_seen": 140700688, + "router_z_loss_mlp": 0.34521484, + "step": 1703, + "time_per_iteration": 2.829838991165161 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083551, + "balance_loss_mlp": 1.04974365, + "epoch": 0.3278183916891112, + "flos": 435831211008.0, + "grad_norm": 0.05938301584715095, + "language_loss": 0.82290888, + "learning_rate": 0.0007848995371844474, + "loss": 0.83374435, + "num_input_tokens_seen": 140765808, + "router_z_loss_mlp": 0.33837891, + "step": 1704, + "time_per_iteration": 2.498462200164795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108751, + "balance_loss_mlp": 1.0532254, + "epoch": 0.3280107733743748, + "flos": 460883119104.0, + "grad_norm": 0.06015932024064871, + "language_loss": 0.80622214, + "learning_rate": 0.0007846434613977801, + "loss": 0.81709725, + "num_input_tokens_seen": 140830512, + "router_z_loss_mlp": 0.34326172, + "step": 1705, + "time_per_iteration": 2.4933369159698486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087045, + "balance_loss_mlp": 1.05330932, + "epoch": 0.3282031550596383, + "flos": 679018484736.0, + "grad_norm": 0.05558890685700398, + "language_loss": 0.78265798, + "learning_rate": 0.0007843872751027203, + "loss": 0.7935285, + "num_input_tokens_seen": 140902816, + "router_z_loss_mlp": 0.33764648, + "step": 1706, + "time_per_iteration": 2.81091046333313 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090191, + "balance_loss_mlp": 1.05621648, + "epoch": 0.3283955367449019, + "flos": 544821023232.0, + "grad_norm": 0.10097233050810657, + "language_loss": 0.87312186, + "learning_rate": 0.0007841309783987287, + "loss": 0.88402379, + "num_input_tokens_seen": 140975488, + "router_z_loss_mlp": 0.34008789, + "step": 1707, + "time_per_iteration": 2.7456212043762207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083118, + "balance_loss_mlp": 1.0490005, + "epoch": 0.32858791843016544, + "flos": 481017153024.0, + "grad_norm": 0.06288690811568091, + "language_loss": 0.89185357, + "learning_rate": 0.0007838745713853084, + "loss": 0.90268475, + "num_input_tokens_seen": 141043248, + "router_z_loss_mlp": 0.34155273, + "step": 1708, + "time_per_iteration": 2.5734565258026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086944, + "balance_loss_mlp": 1.0529933, + "epoch": 0.328780300115429, + "flos": 566529389568.0, + "grad_norm": 0.059735917623485235, + "language_loss": 0.84101981, + "learning_rate": 0.0007836180541620053, + "loss": 0.85188925, + "num_input_tokens_seen": 141119408, + "router_z_loss_mlp": 0.33984375, + "step": 1709, + "time_per_iteration": 2.6734848022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04843152, + "epoch": 0.32897268180069256, + "flos": 475787948544.0, + "grad_norm": 0.06557165815913592, + "language_loss": 0.86666405, + "learning_rate": 0.0007833614268284082, + "loss": 0.87748647, + "num_input_tokens_seen": 141184112, + "router_z_loss_mlp": 0.33813477, + "step": 1710, + "time_per_iteration": 2.5004236698150635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109204, + "balance_loss_mlp": 1.07611382, + "epoch": 0.32916506348595614, + "flos": 1576517008896.0, + "grad_norm": 0.028486921929439343, + "language_loss": 0.74109769, + "learning_rate": 0.0007831046894841489, + "loss": 0.75201809, + "num_input_tokens_seen": 141414960, + "router_z_loss_mlp": 0.15917969, + "step": 1711, + "time_per_iteration": 4.857421159744263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084065, + "balance_loss_mlp": 1.05011439, + "epoch": 0.3293574451712197, + "flos": 482646739968.0, + "grad_norm": 0.05383776069577274, + "language_loss": 0.78376174, + "learning_rate": 0.0007828478422289016, + "loss": 0.79460239, + "num_input_tokens_seen": 141485744, + "router_z_loss_mlp": 0.33984375, + "step": 1712, + "time_per_iteration": 2.5763661861419678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089524, + "balance_loss_mlp": 1.05438161, + "epoch": 0.32954982685648326, + "flos": 622266872832.0, + "grad_norm": 0.05220026625301518, + "language_loss": 0.89185119, + "learning_rate": 0.0007825908851623833, + "loss": 0.90274644, + "num_input_tokens_seen": 141560592, + "router_z_loss_mlp": 0.35205078, + "step": 1713, + "time_per_iteration": 2.7262768745422363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081531, + "balance_loss_mlp": 1.04648352, + "epoch": 0.32974220854174685, + "flos": 544697367552.0, + "grad_norm": 0.06806070360888057, + "language_loss": 0.85360491, + "learning_rate": 0.0007823338183843533, + "loss": 0.86442018, + "num_input_tokens_seen": 141630400, + "router_z_loss_mlp": 0.35083008, + "step": 1714, + "time_per_iteration": 2.652278184890747 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078166, + "balance_loss_mlp": 1.0447638, + "epoch": 0.3299345902270104, + "flos": 981740708352.0, + "grad_norm": 0.05603975865081876, + "language_loss": 0.8075726, + "learning_rate": 0.0007820766419946141, + "loss": 0.81835425, + "num_input_tokens_seen": 141721552, + "router_z_loss_mlp": 0.33422852, + "step": 1715, + "time_per_iteration": 3.282278537750244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087722, + "balance_loss_mlp": 1.07227242, + "epoch": 0.33012697191227397, + "flos": 1402857432576.0, + "grad_norm": 0.02753251532821737, + "language_loss": 0.7967248, + "learning_rate": 0.0007818193560930102, + "loss": 0.80760199, + "num_input_tokens_seen": 141956464, + "router_z_loss_mlp": 0.15429688, + "step": 1716, + "time_per_iteration": 4.925649881362915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081747, + "balance_loss_mlp": 1.04789162, + "epoch": 0.3303193535975375, + "flos": 504897781248.0, + "grad_norm": 0.09582479469105179, + "language_loss": 0.75968826, + "learning_rate": 0.0007815619607794288, + "loss": 0.77050573, + "num_input_tokens_seen": 142029552, + "router_z_loss_mlp": 0.33886719, + "step": 1717, + "time_per_iteration": 2.653355598449707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077451, + "balance_loss_mlp": 1.04423952, + "epoch": 0.3305117352828011, + "flos": 937602390528.0, + "grad_norm": 0.059316474336830904, + "language_loss": 0.8254683, + "learning_rate": 0.0007813044561538001, + "loss": 0.83624279, + "num_input_tokens_seen": 142117344, + "router_z_loss_mlp": 0.33227539, + "step": 1718, + "time_per_iteration": 3.1251535415649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084283, + "balance_loss_mlp": 1.05035567, + "epoch": 0.3307041169680646, + "flos": 721176597504.0, + "grad_norm": 0.08429030846847434, + "language_loss": 0.88411027, + "learning_rate": 0.0007810468423160958, + "loss": 0.89495313, + "num_input_tokens_seen": 142190096, + "router_z_loss_mlp": 0.33959961, + "step": 1719, + "time_per_iteration": 2.8598783016204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090007, + "balance_loss_mlp": 1.05760598, + "epoch": 0.3308964986533282, + "flos": 583315499520.0, + "grad_norm": 0.04547421634197757, + "language_loss": 0.81920642, + "learning_rate": 0.0007807891193663306, + "loss": 0.8301065, + "num_input_tokens_seen": 142265584, + "router_z_loss_mlp": 0.32397461, + "step": 1720, + "time_per_iteration": 2.7775802612304688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092448, + "balance_loss_mlp": 1.0582583, + "epoch": 0.33108888033859174, + "flos": 473340672000.0, + "grad_norm": 0.07591254280459368, + "language_loss": 0.82440275, + "learning_rate": 0.0007805312874045614, + "loss": 0.83532727, + "num_input_tokens_seen": 142330352, + "router_z_loss_mlp": 0.34228516, + "step": 1721, + "time_per_iteration": 2.5138351917266846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100808, + "balance_loss_mlp": 1.06657076, + "epoch": 0.3312812620238553, + "flos": 385913659392.0, + "grad_norm": 0.08101052667778896, + "language_loss": 0.86919391, + "learning_rate": 0.0007802733465308874, + "loss": 0.88020205, + "num_input_tokens_seen": 142392208, + "router_z_loss_mlp": 0.34277344, + "step": 1722, + "time_per_iteration": 2.440809726715088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106056, + "balance_loss_mlp": 1.07074606, + "epoch": 0.3314736437091189, + "flos": 494292395520.0, + "grad_norm": 0.0567806329299034, + "language_loss": 0.8509872, + "learning_rate": 0.0007800152968454501, + "loss": 0.86204773, + "num_input_tokens_seen": 142462112, + "router_z_loss_mlp": 0.35375977, + "step": 1723, + "time_per_iteration": 2.61602520942688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090428, + "balance_loss_mlp": 1.056072, + "epoch": 0.33166602539438245, + "flos": 653346736128.0, + "grad_norm": 0.038882210578800376, + "language_loss": 0.90476918, + "learning_rate": 0.0007797571384484334, + "loss": 0.91567349, + "num_input_tokens_seen": 142539120, + "router_z_loss_mlp": 0.34399414, + "step": 1724, + "time_per_iteration": 2.857562780380249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090291, + "balance_loss_mlp": 1.05586314, + "epoch": 0.33185840707964603, + "flos": 520550489088.0, + "grad_norm": 0.04870849772261114, + "language_loss": 0.91599178, + "learning_rate": 0.0007794988714400633, + "loss": 0.92689478, + "num_input_tokens_seen": 142611520, + "router_z_loss_mlp": 0.34448242, + "step": 1725, + "time_per_iteration": 2.5884361267089844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097227, + "balance_loss_mlp": 1.06077266, + "epoch": 0.33205078876490957, + "flos": 436712919552.0, + "grad_norm": 0.05260760436426434, + "language_loss": 0.85199809, + "learning_rate": 0.0007792404959206079, + "loss": 0.86297035, + "num_input_tokens_seen": 142676064, + "router_z_loss_mlp": 0.36474609, + "step": 1726, + "time_per_iteration": 2.4855122566223145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095942, + "balance_loss_mlp": 1.05965447, + "epoch": 0.33224317045017315, + "flos": 768400971264.0, + "grad_norm": 0.052329818141719754, + "language_loss": 0.81527805, + "learning_rate": 0.0007789820119903774, + "loss": 0.82623744, + "num_input_tokens_seen": 142750944, + "router_z_loss_mlp": 0.36279297, + "step": 1727, + "time_per_iteration": 2.945405960083008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105991, + "balance_loss_mlp": 1.04579556, + "epoch": 0.3324355521354367, + "flos": 1465656090624.0, + "grad_norm": 0.02932642968329903, + "language_loss": 0.78492665, + "learning_rate": 0.0007787234197497242, + "loss": 0.79552573, + "num_input_tokens_seen": 142974032, + "router_z_loss_mlp": 0.14160156, + "step": 1728, + "time_per_iteration": 4.810296297073364 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084477, + "balance_loss_mlp": 1.04880977, + "epoch": 0.3326279338207003, + "flos": 496415195136.0, + "grad_norm": 0.05339720943334808, + "language_loss": 0.83919221, + "learning_rate": 0.0007784647192990428, + "loss": 0.85003698, + "num_input_tokens_seen": 143047280, + "router_z_loss_mlp": 0.35693359, + "step": 1729, + "time_per_iteration": 2.6848132610321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083753, + "balance_loss_mlp": 1.04844344, + "epoch": 0.33282031550596386, + "flos": 635600342016.0, + "grad_norm": 0.05212570885713578, + "language_loss": 0.80661625, + "learning_rate": 0.0007782059107387696, + "loss": 0.81745386, + "num_input_tokens_seen": 143124224, + "router_z_loss_mlp": 0.35351562, + "step": 1730, + "time_per_iteration": 2.874356269836426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078294, + "balance_loss_mlp": 1.04329371, + "epoch": 0.3330126971912274, + "flos": 689210643456.0, + "grad_norm": 0.05636936917064103, + "language_loss": 0.88407743, + "learning_rate": 0.0007779469941693826, + "loss": 0.89486033, + "num_input_tokens_seen": 143194048, + "router_z_loss_mlp": 0.3503418, + "step": 1731, + "time_per_iteration": 2.7914862632751465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079351, + "balance_loss_mlp": 1.04511368, + "epoch": 0.333205078876491, + "flos": 566184563712.0, + "grad_norm": 0.05730145040609657, + "language_loss": 0.77017218, + "learning_rate": 0.0007776879696914029, + "loss": 0.78096569, + "num_input_tokens_seen": 143272976, + "router_z_loss_mlp": 0.34277344, + "step": 1732, + "time_per_iteration": 2.8158769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081, + "balance_loss_mlp": 1.04666734, + "epoch": 0.3333974605617545, + "flos": 640618550784.0, + "grad_norm": 0.044212495165629015, + "language_loss": 0.8903594, + "learning_rate": 0.000777428837405392, + "loss": 0.90116942, + "num_input_tokens_seen": 143346496, + "router_z_loss_mlp": 0.34375, + "step": 1733, + "time_per_iteration": 2.8417906761169434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079717, + "balance_loss_mlp": 1.04536092, + "epoch": 0.3335898422470181, + "flos": 461597501952.0, + "grad_norm": 0.05109390766697835, + "language_loss": 0.87070495, + "learning_rate": 0.0007771695974119544, + "loss": 0.88150203, + "num_input_tokens_seen": 143410448, + "router_z_loss_mlp": 0.34399414, + "step": 1734, + "time_per_iteration": 2.4995057582855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078764, + "balance_loss_mlp": 1.04514742, + "epoch": 0.33378222393228163, + "flos": 852504791040.0, + "grad_norm": 0.05825672376588237, + "language_loss": 0.75576115, + "learning_rate": 0.0007769102498117359, + "loss": 0.76654887, + "num_input_tokens_seen": 143492416, + "router_z_loss_mlp": 0.33642578, + "step": 1735, + "time_per_iteration": 3.0868663787841797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083295, + "balance_loss_mlp": 1.04991651, + "epoch": 0.3339746056175452, + "flos": 954256080384.0, + "grad_norm": 0.05069255593645712, + "language_loss": 0.79858601, + "learning_rate": 0.000776650794705424, + "loss": 0.80941892, + "num_input_tokens_seen": 143590096, + "router_z_loss_mlp": 0.33398438, + "step": 1736, + "time_per_iteration": 3.2665328979492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108809, + "balance_loss_mlp": 1.05387688, + "epoch": 0.33416698730280875, + "flos": 544559155200.0, + "grad_norm": 0.045819605067785145, + "language_loss": 0.82160866, + "learning_rate": 0.0007763912321937483, + "loss": 0.83248949, + "num_input_tokens_seen": 143663344, + "router_z_loss_mlp": 0.34228516, + "step": 1737, + "time_per_iteration": 2.677316665649414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081816, + "balance_loss_mlp": 1.04798412, + "epoch": 0.33435936898807234, + "flos": 1013652817920.0, + "grad_norm": 0.053421386657792044, + "language_loss": 0.82471478, + "learning_rate": 0.0007761315623774799, + "loss": 0.8355329, + "num_input_tokens_seen": 143753072, + "router_z_loss_mlp": 0.33862305, + "step": 1738, + "time_per_iteration": 3.4182283878326416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089571, + "balance_loss_mlp": 1.05554879, + "epoch": 0.3345517506733359, + "flos": 614935217664.0, + "grad_norm": 0.051536505858366714, + "language_loss": 0.87671852, + "learning_rate": 0.0007758717853574313, + "loss": 0.88761419, + "num_input_tokens_seen": 143827280, + "router_z_loss_mlp": 0.34057617, + "step": 1739, + "time_per_iteration": 2.7348380088806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084561, + "balance_loss_mlp": 1.05125391, + "epoch": 0.33474413235859946, + "flos": 494350622208.0, + "grad_norm": 0.06141180611747274, + "language_loss": 0.9002257, + "learning_rate": 0.0007756119012344571, + "loss": 0.91107136, + "num_input_tokens_seen": 143895072, + "router_z_loss_mlp": 0.33325195, + "step": 1740, + "time_per_iteration": 2.536121129989624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091135, + "balance_loss_mlp": 1.05754209, + "epoch": 0.33493651404386304, + "flos": 628105743360.0, + "grad_norm": 0.06662069566578578, + "language_loss": 0.84404671, + "learning_rate": 0.0007753519101094535, + "loss": 0.85495806, + "num_input_tokens_seen": 143965728, + "router_z_loss_mlp": 0.33618164, + "step": 1741, + "time_per_iteration": 2.753371238708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082833, + "balance_loss_mlp": 1.04945421, + "epoch": 0.3351288957291266, + "flos": 513474909696.0, + "grad_norm": 0.05750412427252262, + "language_loss": 0.86366677, + "learning_rate": 0.0007750918120833575, + "loss": 0.87449515, + "num_input_tokens_seen": 144030272, + "router_z_loss_mlp": 0.33398438, + "step": 1742, + "time_per_iteration": 2.56093168258667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082276, + "balance_loss_mlp": 1.05037546, + "epoch": 0.33532127741439016, + "flos": 647008860672.0, + "grad_norm": 0.0676260973392943, + "language_loss": 0.87342101, + "learning_rate": 0.0007748316072571485, + "loss": 0.88424373, + "num_input_tokens_seen": 144104048, + "router_z_loss_mlp": 0.31884766, + "step": 1743, + "time_per_iteration": 2.7759556770324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086123, + "balance_loss_mlp": 1.05193388, + "epoch": 0.3355136590996537, + "flos": 768134721024.0, + "grad_norm": 0.047185436483198326, + "language_loss": 0.79306734, + "learning_rate": 0.0007745712957318467, + "loss": 0.80392861, + "num_input_tokens_seen": 144180432, + "router_z_loss_mlp": 0.34228516, + "step": 1744, + "time_per_iteration": 2.959686756134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085284, + "balance_loss_mlp": 1.05119014, + "epoch": 0.3357060407849173, + "flos": 595259490816.0, + "grad_norm": 0.046948111550021425, + "language_loss": 0.86506951, + "learning_rate": 0.0007743108776085141, + "loss": 0.87592232, + "num_input_tokens_seen": 144258704, + "router_z_loss_mlp": 0.34106445, + "step": 1745, + "time_per_iteration": 2.7391204833984375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089169, + "balance_loss_mlp": 1.05462217, + "epoch": 0.3358984224701808, + "flos": 598288730112.0, + "grad_norm": 0.04983419543630797, + "language_loss": 0.82728243, + "learning_rate": 0.0007740503529882543, + "loss": 0.8381741, + "num_input_tokens_seen": 144335104, + "router_z_loss_mlp": 0.34594727, + "step": 1746, + "time_per_iteration": 2.788041114807129 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108953, + "balance_loss_mlp": 1.05474496, + "epoch": 0.3360908041554444, + "flos": 578055771648.0, + "grad_norm": 0.05677254755827829, + "language_loss": 0.91252941, + "learning_rate": 0.0007737897219722114, + "loss": 0.92342472, + "num_input_tokens_seen": 144402912, + "router_z_loss_mlp": 0.34790039, + "step": 1747, + "time_per_iteration": 2.6752376556396484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083715, + "balance_loss_mlp": 1.04945374, + "epoch": 0.336283185840708, + "flos": 513332315136.0, + "grad_norm": 0.05427874766165502, + "language_loss": 0.81146061, + "learning_rate": 0.0007735289846615716, + "loss": 0.82229781, + "num_input_tokens_seen": 144475328, + "router_z_loss_mlp": 0.34301758, + "step": 1748, + "time_per_iteration": 2.670315742492676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083181, + "balance_loss_mlp": 1.04984999, + "epoch": 0.3364755675259715, + "flos": 524716102656.0, + "grad_norm": 0.05445380235157479, + "language_loss": 0.81899059, + "learning_rate": 0.0007732681411575621, + "loss": 0.82982242, + "num_input_tokens_seen": 144548288, + "router_z_loss_mlp": 0.33349609, + "step": 1749, + "time_per_iteration": 2.644740104675293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079502, + "balance_loss_mlp": 1.04567027, + "epoch": 0.3366679492112351, + "flos": 554594162688.0, + "grad_norm": 0.05291201013717534, + "language_loss": 0.87517959, + "learning_rate": 0.0007730071915614514, + "loss": 0.88597459, + "num_input_tokens_seen": 144619488, + "router_z_loss_mlp": 0.33862305, + "step": 1750, + "time_per_iteration": 2.6605777740478516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082694, + "balance_loss_mlp": 1.04874277, + "epoch": 0.33686033089649864, + "flos": 427051851264.0, + "grad_norm": 0.07867660779661921, + "language_loss": 0.88976741, + "learning_rate": 0.0007727461359745489, + "loss": 0.90059435, + "num_input_tokens_seen": 144682560, + "router_z_loss_mlp": 0.33984375, + "step": 1751, + "time_per_iteration": 2.4562768936157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082471, + "balance_loss_mlp": 1.04987907, + "epoch": 0.3370527125817622, + "flos": 541452750336.0, + "grad_norm": 0.05472309390748721, + "language_loss": 0.86156446, + "learning_rate": 0.0007724849744982056, + "loss": 0.87238914, + "num_input_tokens_seen": 144753328, + "router_z_loss_mlp": 0.32592773, + "step": 1752, + "time_per_iteration": 2.683575391769409 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086284, + "balance_loss_mlp": 1.05295336, + "epoch": 0.33724509426702576, + "flos": 541836864000.0, + "grad_norm": 0.052181206472060114, + "language_loss": 0.81578106, + "learning_rate": 0.0007722237072338131, + "loss": 0.82664388, + "num_input_tokens_seen": 144827312, + "router_z_loss_mlp": 0.33349609, + "step": 1753, + "time_per_iteration": 2.7059788703918457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108949, + "balance_loss_mlp": 1.05563486, + "epoch": 0.33743747595228935, + "flos": 472557888000.0, + "grad_norm": 0.063588606701447, + "language_loss": 0.85402888, + "learning_rate": 0.0007719623342828046, + "loss": 0.86492383, + "num_input_tokens_seen": 144893488, + "router_z_loss_mlp": 0.33886719, + "step": 1754, + "time_per_iteration": 2.5117459297180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05708706, + "epoch": 0.33762985763755293, + "flos": 469564964352.0, + "grad_norm": 0.05602573096673387, + "language_loss": 0.84115714, + "learning_rate": 0.000771700855746654, + "loss": 0.85206437, + "num_input_tokens_seen": 144961152, + "router_z_loss_mlp": 0.33666992, + "step": 1755, + "time_per_iteration": 2.5685064792633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085423, + "balance_loss_mlp": 1.05214, + "epoch": 0.33782223932281646, + "flos": 492002270208.0, + "grad_norm": 0.05352941428578995, + "language_loss": 0.88329422, + "learning_rate": 0.0007714392717268763, + "loss": 0.89414847, + "num_input_tokens_seen": 145030576, + "router_z_loss_mlp": 0.33300781, + "step": 1756, + "time_per_iteration": 2.568432569503784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084365, + "balance_loss_mlp": 1.04981852, + "epoch": 0.33801462100808005, + "flos": 464827562496.0, + "grad_norm": 0.051807056092833426, + "language_loss": 0.86368155, + "learning_rate": 0.0007711775823250273, + "loss": 0.87452519, + "num_input_tokens_seen": 145095648, + "router_z_loss_mlp": 0.34594727, + "step": 1757, + "time_per_iteration": 2.5542263984680176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010902, + "balance_loss_mlp": 1.05660665, + "epoch": 0.3382070026933436, + "flos": 795319603200.0, + "grad_norm": 0.05510084593487172, + "language_loss": 0.83019066, + "learning_rate": 0.0007709157876427039, + "loss": 0.84109271, + "num_input_tokens_seen": 145181248, + "router_z_loss_mlp": 0.33618164, + "step": 1758, + "time_per_iteration": 3.07852840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082149, + "balance_loss_mlp": 1.04903245, + "epoch": 0.33839938437860717, + "flos": 508181686272.0, + "grad_norm": 0.0524958838474987, + "language_loss": 0.85602981, + "learning_rate": 0.0007706538877815439, + "loss": 0.86685127, + "num_input_tokens_seen": 145252944, + "router_z_loss_mlp": 0.33129883, + "step": 1759, + "time_per_iteration": 2.6002085208892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082716, + "balance_loss_mlp": 1.05021918, + "epoch": 0.3385917660638707, + "flos": 483986755584.0, + "grad_norm": 0.05079207863068971, + "language_loss": 0.83150595, + "learning_rate": 0.0007703918828432259, + "loss": 0.84233308, + "num_input_tokens_seen": 145323168, + "router_z_loss_mlp": 0.32495117, + "step": 1760, + "time_per_iteration": 2.5961215496063232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086286, + "balance_loss_mlp": 1.05297899, + "epoch": 0.3387841477491343, + "flos": 545071306752.0, + "grad_norm": 0.0542286668270813, + "language_loss": 0.89021361, + "learning_rate": 0.000770129772929469, + "loss": 0.9010765, + "num_input_tokens_seen": 145395776, + "router_z_loss_mlp": 0.33325195, + "step": 1761, + "time_per_iteration": 2.6393394470214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076289, + "balance_loss_mlp": 1.04264784, + "epoch": 0.3389765294343978, + "flos": 719487373824.0, + "grad_norm": 0.057381721603975526, + "language_loss": 0.88803625, + "learning_rate": 0.0007698675581420334, + "loss": 0.89879912, + "num_input_tokens_seen": 145470576, + "router_z_loss_mlp": 0.33666992, + "step": 1762, + "time_per_iteration": 2.8959014415740967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.03968656, + "epoch": 0.3391689111196614, + "flos": 699596269056.0, + "grad_norm": 0.05381480837735757, + "language_loss": 0.78922743, + "learning_rate": 0.0007696052385827199, + "loss": 0.79995811, + "num_input_tokens_seen": 145548896, + "router_z_loss_mlp": 0.33398438, + "step": 1763, + "time_per_iteration": 2.9110584259033203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068767, + "balance_loss_mlp": 1.03519773, + "epoch": 0.339361292804925, + "flos": 626806425600.0, + "grad_norm": 0.05521588721088573, + "language_loss": 0.78407156, + "learning_rate": 0.00076934281435337, + "loss": 0.79475927, + "num_input_tokens_seen": 145617136, + "router_z_loss_mlp": 0.33569336, + "step": 1764, + "time_per_iteration": 2.7673115730285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073935, + "balance_loss_mlp": 1.04043674, + "epoch": 0.33955367449018853, + "flos": 609302960640.0, + "grad_norm": 0.0615155635578628, + "language_loss": 0.85995364, + "learning_rate": 0.0007690802855558658, + "loss": 0.87069303, + "num_input_tokens_seen": 145696416, + "router_z_loss_mlp": 0.33520508, + "step": 1765, + "time_per_iteration": 2.871255397796631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083714, + "balance_loss_mlp": 1.07131636, + "epoch": 0.3397460561754521, + "flos": 1452494177280.0, + "grad_norm": 0.03113174858532202, + "language_loss": 0.76374954, + "learning_rate": 0.0007688176522921302, + "loss": 0.77458668, + "num_input_tokens_seen": 145919680, + "router_z_loss_mlp": 0.12402344, + "step": 1766, + "time_per_iteration": 4.906975746154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089831, + "balance_loss_mlp": 1.05485463, + "epoch": 0.33993843786071565, + "flos": 487068429312.0, + "grad_norm": 0.059784397062932884, + "language_loss": 0.89060128, + "learning_rate": 0.0007685549146641262, + "loss": 0.90149957, + "num_input_tokens_seen": 145984272, + "router_z_loss_mlp": 0.35009766, + "step": 1767, + "time_per_iteration": 2.5172948837280273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081834, + "balance_loss_mlp": 1.04683375, + "epoch": 0.34013081954597923, + "flos": 417115768320.0, + "grad_norm": 0.05470212710373979, + "language_loss": 0.88085568, + "learning_rate": 0.0007682920727738579, + "loss": 0.89167398, + "num_input_tokens_seen": 146047248, + "router_z_loss_mlp": 0.35058594, + "step": 1768, + "time_per_iteration": 2.4423539638519287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084324, + "balance_loss_mlp": 1.04939604, + "epoch": 0.34032320123124277, + "flos": 437293472256.0, + "grad_norm": 0.06228189549734304, + "language_loss": 0.84428132, + "learning_rate": 0.000768029126723369, + "loss": 0.85512453, + "num_input_tokens_seen": 146111872, + "router_z_loss_mlp": 0.34960938, + "step": 1769, + "time_per_iteration": 2.5054280757904053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082733, + "balance_loss_mlp": 1.04811513, + "epoch": 0.34051558291650635, + "flos": 457353312768.0, + "grad_norm": 0.058774755629116764, + "language_loss": 0.81489038, + "learning_rate": 0.0007677660766147447, + "loss": 0.82571769, + "num_input_tokens_seen": 146172608, + "router_z_loss_mlp": 0.34667969, + "step": 1770, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_mlp": 1.01844561, + "epoch": 0.3407079646017699, + "flos": 1558029938688.0, + "grad_norm": 0.017684799672329513, + "language_loss": 0.72470945, + "learning_rate": 0.0007675029225501102, + "loss": 0.7350117, + "num_input_tokens_seen": 146413584, + "router_z_loss_mlp": 0.11767578, + "step": 1771, + "time_per_iteration": 4.924427032470703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081114, + "balance_loss_mlp": 1.04666233, + "epoch": 0.3409003462870335, + "flos": 492312190464.0, + "grad_norm": 0.06375677891517043, + "language_loss": 0.79619604, + "learning_rate": 0.0007672396646316306, + "loss": 0.80700719, + "num_input_tokens_seen": 146476992, + "router_z_loss_mlp": 0.3449707, + "step": 1772, + "time_per_iteration": 2.5239012241363525 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081305, + "balance_loss_mlp": 1.04589987, + "epoch": 0.34109272797229706, + "flos": 808145303040.0, + "grad_norm": 0.06003817873980187, + "language_loss": 0.80518734, + "learning_rate": 0.000766976302961512, + "loss": 0.81600046, + "num_input_tokens_seen": 146552848, + "router_z_loss_mlp": 0.35424805, + "step": 1773, + "time_per_iteration": 2.9730074405670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083157, + "balance_loss_mlp": 1.04834807, + "epoch": 0.3412851096575606, + "flos": 469903997952.0, + "grad_norm": 0.05958263274361502, + "language_loss": 0.81420594, + "learning_rate": 0.0007667128376420003, + "loss": 0.82503754, + "num_input_tokens_seen": 146617504, + "router_z_loss_mlp": 0.34863281, + "step": 1774, + "time_per_iteration": 2.521329879760742 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073378, + "balance_loss_mlp": 1.03842556, + "epoch": 0.3414774913428242, + "flos": 595402085376.0, + "grad_norm": 0.09709010294240925, + "language_loss": 0.84563607, + "learning_rate": 0.0007664492687753817, + "loss": 0.85636985, + "num_input_tokens_seen": 146691568, + "router_z_loss_mlp": 0.34985352, + "step": 1775, + "time_per_iteration": 2.6744766235351562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072901, + "balance_loss_mlp": 1.03976059, + "epoch": 0.3416698730280877, + "flos": 527202667008.0, + "grad_norm": 0.05030413358353647, + "language_loss": 0.81513566, + "learning_rate": 0.000766185596463983, + "loss": 0.82586467, + "num_input_tokens_seen": 146764208, + "router_z_loss_mlp": 0.33154297, + "step": 1776, + "time_per_iteration": 2.6050221920013428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073672, + "balance_loss_mlp": 1.03962612, + "epoch": 0.3418622547133513, + "flos": 874272794112.0, + "grad_norm": 0.05039515754698922, + "language_loss": 0.76683038, + "learning_rate": 0.0007659218208101706, + "loss": 0.77756709, + "num_input_tokens_seen": 146847744, + "router_z_loss_mlp": 0.34082031, + "step": 1777, + "time_per_iteration": 3.0900516510009766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079189, + "balance_loss_mlp": 1.04504752, + "epoch": 0.34205463639861483, + "flos": 603462680064.0, + "grad_norm": 0.04915159817243754, + "language_loss": 0.84680861, + "learning_rate": 0.0007656579419163515, + "loss": 0.85760045, + "num_input_tokens_seen": 146918336, + "router_z_loss_mlp": 0.34179688, + "step": 1778, + "time_per_iteration": 2.7328884601593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081812, + "balance_loss_mlp": 1.04774237, + "epoch": 0.3422470180838784, + "flos": 463547183616.0, + "grad_norm": 0.05230649511498847, + "language_loss": 0.76939148, + "learning_rate": 0.0007653939598849724, + "loss": 0.7802096, + "num_input_tokens_seen": 146982496, + "router_z_loss_mlp": 0.34106445, + "step": 1779, + "time_per_iteration": 2.5020573139190674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_mlp": 1.01441097, + "epoch": 0.34243939976914195, + "flos": 1585584377856.0, + "grad_norm": 0.019842751190116498, + "language_loss": 0.82880205, + "learning_rate": 0.0007651298748185204, + "loss": 0.83907396, + "num_input_tokens_seen": 147213600, + "router_z_loss_mlp": 0.12792969, + "step": 1780, + "time_per_iteration": 4.919352054595947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107992, + "balance_loss_mlp": 1.04656482, + "epoch": 0.34263178145440554, + "flos": 872662146048.0, + "grad_norm": 0.0514393238831889, + "language_loss": 0.80344206, + "learning_rate": 0.000764865686819522, + "loss": 0.81424129, + "num_input_tokens_seen": 147287664, + "router_z_loss_mlp": 0.33374023, + "step": 1781, + "time_per_iteration": 3.059682846069336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089318, + "balance_loss_mlp": 1.05546236, + "epoch": 0.3428241631396691, + "flos": 506630674944.0, + "grad_norm": 0.04318417455303755, + "language_loss": 0.85701579, + "learning_rate": 0.0007646013959905449, + "loss": 0.86790895, + "num_input_tokens_seen": 147356800, + "router_z_loss_mlp": 0.33886719, + "step": 1782, + "time_per_iteration": 2.572772741317749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085097, + "balance_loss_mlp": 1.05162311, + "epoch": 0.34301654482493266, + "flos": 879669324288.0, + "grad_norm": 0.05640606275212692, + "language_loss": 0.80626374, + "learning_rate": 0.0007643370024341949, + "loss": 0.81711471, + "num_input_tokens_seen": 147432496, + "router_z_loss_mlp": 0.33496094, + "step": 1783, + "time_per_iteration": 3.0805578231811523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089012, + "balance_loss_mlp": 1.05472708, + "epoch": 0.34320892651019624, + "flos": 431537559552.0, + "grad_norm": 0.05116039291223259, + "language_loss": 0.82947731, + "learning_rate": 0.0007640725062531195, + "loss": 0.84036732, + "num_input_tokens_seen": 147495856, + "router_z_loss_mlp": 0.34326172, + "step": 1784, + "time_per_iteration": 2.497859239578247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083577, + "balance_loss_mlp": 1.05010295, + "epoch": 0.3434013081954598, + "flos": 463404589056.0, + "grad_norm": 0.06763804466989645, + "language_loss": 0.86272931, + "learning_rate": 0.0007638079075500047, + "loss": 0.87356508, + "num_input_tokens_seen": 147559632, + "router_z_loss_mlp": 0.33496094, + "step": 1785, + "time_per_iteration": 2.516842842102051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017655, + "balance_loss_mlp": 1.0058769, + "epoch": 0.34359368988072336, + "flos": 1556499276288.0, + "grad_norm": 0.01279941843938601, + "language_loss": 0.75180668, + "learning_rate": 0.0007635432064275772, + "loss": 0.76198322, + "num_input_tokens_seen": 147794576, + "router_z_loss_mlp": 0.11767578, + "step": 1786, + "time_per_iteration": 4.979317665100098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077073, + "balance_loss_mlp": 1.04417157, + "epoch": 0.3437860715659869, + "flos": 495267236352.0, + "grad_norm": 0.04590480874587016, + "language_loss": 0.83035767, + "learning_rate": 0.0007632784029886026, + "loss": 0.84112841, + "num_input_tokens_seen": 147866960, + "router_z_loss_mlp": 0.32910156, + "step": 1787, + "time_per_iteration": 2.6075103282928467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079505, + "balance_loss_mlp": 1.04617453, + "epoch": 0.3439784532512505, + "flos": 717942154752.0, + "grad_norm": 0.04559278353066439, + "language_loss": 0.85611933, + "learning_rate": 0.0007630134973358873, + "loss": 0.86691439, + "num_input_tokens_seen": 147947808, + "router_z_loss_mlp": 0.33349609, + "step": 1788, + "time_per_iteration": 2.917405366897583 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086657, + "balance_loss_mlp": 1.05327868, + "epoch": 0.34417083493651407, + "flos": 565598218752.0, + "grad_norm": 0.05301353071730806, + "language_loss": 0.86864436, + "learning_rate": 0.0007627484895722763, + "loss": 0.879511, + "num_input_tokens_seen": 148015936, + "router_z_loss_mlp": 0.33398438, + "step": 1789, + "time_per_iteration": 2.6353273391723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081939, + "balance_loss_mlp": 1.04834569, + "epoch": 0.3443632166217776, + "flos": 795988905984.0, + "grad_norm": 0.057022653970397005, + "language_loss": 0.80155563, + "learning_rate": 0.0007624833798006552, + "loss": 0.81237495, + "num_input_tokens_seen": 148099776, + "router_z_loss_mlp": 0.3359375, + "step": 1790, + "time_per_iteration": 3.039126396179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090848, + "balance_loss_mlp": 1.05692101, + "epoch": 0.3445555983070412, + "flos": 569045067264.0, + "grad_norm": 0.05940117534987587, + "language_loss": 0.84113955, + "learning_rate": 0.0007622181681239483, + "loss": 0.85204804, + "num_input_tokens_seen": 148169616, + "router_z_loss_mlp": 0.33935547, + "step": 1791, + "time_per_iteration": 2.6392083168029785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083434, + "balance_loss_mlp": 1.04903054, + "epoch": 0.3447479799923047, + "flos": 568524151296.0, + "grad_norm": 0.04492792711883196, + "language_loss": 0.84501636, + "learning_rate": 0.0007619528546451202, + "loss": 0.8558507, + "num_input_tokens_seen": 148247824, + "router_z_loss_mlp": 0.34448242, + "step": 1792, + "time_per_iteration": 2.776982069015503 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080197, + "balance_loss_mlp": 1.04708052, + "epoch": 0.3449403616775683, + "flos": 967323299328.0, + "grad_norm": 0.05878857203246004, + "language_loss": 0.8358798, + "learning_rate": 0.0007616874394671745, + "loss": 0.84668171, + "num_input_tokens_seen": 148333040, + "router_z_loss_mlp": 0.33129883, + "step": 1793, + "time_per_iteration": 3.3427693843841553 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074615, + "balance_loss_mlp": 1.04128361, + "epoch": 0.34513274336283184, + "flos": 568340858880.0, + "grad_norm": 0.05893035372227358, + "language_loss": 0.84961653, + "learning_rate": 0.0007614219226931547, + "loss": 0.86036265, + "num_input_tokens_seen": 148401840, + "router_z_loss_mlp": 0.33349609, + "step": 1794, + "time_per_iteration": 2.6591315269470215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070169, + "balance_loss_mlp": 1.03783977, + "epoch": 0.3453251250480954, + "flos": 460715793408.0, + "grad_norm": 0.06432823181520617, + "language_loss": 0.84724808, + "learning_rate": 0.0007611563044261435, + "loss": 0.85794979, + "num_input_tokens_seen": 148466576, + "router_z_loss_mlp": 0.32324219, + "step": 1795, + "time_per_iteration": 2.51755690574646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078106, + "balance_loss_mlp": 1.0443697, + "epoch": 0.34551750673335896, + "flos": 415397431296.0, + "grad_norm": 0.0640589434438139, + "language_loss": 0.87120652, + "learning_rate": 0.0007608905847692631, + "loss": 0.88198757, + "num_input_tokens_seen": 148530016, + "router_z_loss_mlp": 0.33764648, + "step": 1796, + "time_per_iteration": 2.47190260887146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074227, + "balance_loss_mlp": 1.04103947, + "epoch": 0.34570988841862255, + "flos": 587540749824.0, + "grad_norm": 0.04642061059617041, + "language_loss": 0.86689866, + "learning_rate": 0.0007606247638256749, + "loss": 0.8776409, + "num_input_tokens_seen": 148610064, + "router_z_loss_mlp": 0.33203125, + "step": 1797, + "time_per_iteration": 2.956444025039673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094984, + "balance_loss_mlp": 1.08373046, + "epoch": 0.34590227010388613, + "flos": 1566835439616.0, + "grad_norm": 0.041839887126655914, + "language_loss": 0.78170294, + "learning_rate": 0.0007603588416985798, + "loss": 0.79265279, + "num_input_tokens_seen": 148835872, + "router_z_loss_mlp": 0.11230469, + "step": 1798, + "time_per_iteration": 4.9134039878845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01064584, + "balance_loss_mlp": 1.05352104, + "epoch": 0.34609465178914967, + "flos": 1536950177280.0, + "grad_norm": 0.029939636480755576, + "language_loss": 0.79327202, + "learning_rate": 0.0007600928184912179, + "loss": 0.80391788, + "num_input_tokens_seen": 149066864, + "router_z_loss_mlp": 0.11083984, + "step": 1799, + "time_per_iteration": 4.743322849273682 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083924, + "balance_loss_mlp": 1.05054486, + "epoch": 0.34628703347441325, + "flos": 609075998208.0, + "grad_norm": 0.0564087129204809, + "language_loss": 0.85731971, + "learning_rate": 0.0007598266943068686, + "loss": 0.86815894, + "num_input_tokens_seen": 149141600, + "router_z_loss_mlp": 0.33398438, + "step": 1800, + "time_per_iteration": 2.7374043464660645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077797, + "balance_loss_mlp": 1.04603946, + "epoch": 0.3464794151596768, + "flos": 473084596224.0, + "grad_norm": 0.06346922489791823, + "language_loss": 0.83911705, + "learning_rate": 0.0007595604692488507, + "loss": 0.849895, + "num_input_tokens_seen": 149205888, + "router_z_loss_mlp": 0.31738281, + "step": 1801, + "time_per_iteration": 2.5296249389648438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074116, + "balance_loss_mlp": 1.04147625, + "epoch": 0.34667179684494037, + "flos": 605397805056.0, + "grad_norm": 0.05750507090521113, + "language_loss": 0.83014846, + "learning_rate": 0.0007592941434205215, + "loss": 0.84088963, + "num_input_tokens_seen": 149281280, + "router_z_loss_mlp": 0.32641602, + "step": 1802, + "time_per_iteration": 2.758260488510132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01017015, + "balance_loss_mlp": 1.00628662, + "epoch": 0.3468641785302039, + "flos": 1564053511680.0, + "grad_norm": 0.014489769518708178, + "language_loss": 0.73571062, + "learning_rate": 0.0007590277169252782, + "loss": 0.74588072, + "num_input_tokens_seen": 149525008, + "router_z_loss_mlp": 0.10742188, + "step": 1803, + "time_per_iteration": 5.078529119491577 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069541, + "balance_loss_mlp": 1.0363059, + "epoch": 0.3470565602154675, + "flos": 906902258688.0, + "grad_norm": 0.0666829693597375, + "language_loss": 0.79937375, + "learning_rate": 0.0007587611898665566, + "loss": 0.81006914, + "num_input_tokens_seen": 149600624, + "router_z_loss_mlp": 0.33251953, + "step": 1804, + "time_per_iteration": 3.05087947845459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078664, + "balance_loss_mlp": 1.04621565, + "epoch": 0.347248941900731, + "flos": 638613614592.0, + "grad_norm": 0.050247612363814816, + "language_loss": 0.8218019, + "learning_rate": 0.0007584945623478315, + "loss": 0.83258855, + "num_input_tokens_seen": 149674224, + "router_z_loss_mlp": 0.32446289, + "step": 1805, + "time_per_iteration": 2.8188822269439697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071735, + "balance_loss_mlp": 1.03940511, + "epoch": 0.3474413235859946, + "flos": 847009336320.0, + "grad_norm": 0.06830759319763476, + "language_loss": 0.81376302, + "learning_rate": 0.000758227834472617, + "loss": 0.82448041, + "num_input_tokens_seen": 149758688, + "router_z_loss_mlp": 0.32324219, + "step": 1806, + "time_per_iteration": 3.049736976623535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080854, + "balance_loss_mlp": 1.04771423, + "epoch": 0.3476337052712582, + "flos": 515395478016.0, + "grad_norm": 0.0580200838122141, + "language_loss": 0.77365351, + "learning_rate": 0.0007579610063444664, + "loss": 0.78446203, + "num_input_tokens_seen": 149831648, + "router_z_loss_mlp": 0.33154297, + "step": 1807, + "time_per_iteration": 2.768986701965332 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03993857, + "epoch": 0.34782608695652173, + "flos": 913161558528.0, + "grad_norm": 0.05804810611861273, + "language_loss": 0.8735044, + "learning_rate": 0.0007576940780669712, + "loss": 0.88423139, + "num_input_tokens_seen": 149919440, + "router_z_loss_mlp": 0.32763672, + "step": 1808, + "time_per_iteration": 3.200984477996826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073414, + "balance_loss_mlp": 1.04041636, + "epoch": 0.3480184686417853, + "flos": 773374099968.0, + "grad_norm": 0.05336970886803796, + "language_loss": 0.84611619, + "learning_rate": 0.0007574270497437624, + "loss": 0.85685027, + "num_input_tokens_seen": 150001632, + "router_z_loss_mlp": 0.33007812, + "step": 1809, + "time_per_iteration": 2.9432260990142822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069124, + "balance_loss_mlp": 1.03619814, + "epoch": 0.34821085032704885, + "flos": 576549840384.0, + "grad_norm": 0.04930975616190813, + "language_loss": 0.87883413, + "learning_rate": 0.000757159921478509, + "loss": 0.88952535, + "num_input_tokens_seen": 150077552, + "router_z_loss_mlp": 0.3293457, + "step": 1810, + "time_per_iteration": 2.769669771194458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079936, + "balance_loss_mlp": 1.06887364, + "epoch": 0.34840323201231244, + "flos": 1524176911872.0, + "grad_norm": 0.03214902088053901, + "language_loss": 0.74450636, + "learning_rate": 0.0007568926933749201, + "loss": 0.75530577, + "num_input_tokens_seen": 150295328, + "router_z_loss_mlp": 0.11083984, + "step": 1811, + "time_per_iteration": 4.764174222946167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107769, + "balance_loss_mlp": 1.04469275, + "epoch": 0.34859561369757597, + "flos": 508910625792.0, + "grad_norm": 0.059132347701423886, + "language_loss": 0.87255216, + "learning_rate": 0.0007566253655367423, + "loss": 0.88332909, + "num_input_tokens_seen": 150360496, + "router_z_loss_mlp": 0.33007812, + "step": 1812, + "time_per_iteration": 2.578930616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073116, + "balance_loss_mlp": 1.04014218, + "epoch": 0.34878799538283956, + "flos": 548390117376.0, + "grad_norm": 0.051501554075800156, + "language_loss": 0.89574003, + "learning_rate": 0.000756357938067762, + "loss": 0.90647119, + "num_input_tokens_seen": 150432064, + "router_z_loss_mlp": 0.32983398, + "step": 1813, + "time_per_iteration": 2.6508982181549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079394, + "balance_loss_mlp": 1.04673076, + "epoch": 0.34898037706810314, + "flos": 983251021824.0, + "grad_norm": 0.051360492330316726, + "language_loss": 0.82609868, + "learning_rate": 0.0007560904110718033, + "loss": 0.8368926, + "num_input_tokens_seen": 150512176, + "router_z_loss_mlp": 0.32666016, + "step": 1814, + "time_per_iteration": 3.236894130706787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075398, + "balance_loss_mlp": 1.04185271, + "epoch": 0.3491727587533667, + "flos": 681298435584.0, + "grad_norm": 0.05446392192761228, + "language_loss": 0.83478653, + "learning_rate": 0.0007558227846527297, + "loss": 0.84554052, + "num_input_tokens_seen": 150586416, + "router_z_loss_mlp": 0.33569336, + "step": 1815, + "time_per_iteration": 2.8674044609069824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074987, + "balance_loss_mlp": 1.04175162, + "epoch": 0.34936514043863026, + "flos": 393811310592.0, + "grad_norm": 0.0691488486506015, + "language_loss": 0.83195454, + "learning_rate": 0.0007555550589144429, + "loss": 0.84270442, + "num_input_tokens_seen": 150648944, + "router_z_loss_mlp": 0.33251953, + "step": 1816, + "time_per_iteration": 2.421494722366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071292, + "balance_loss_mlp": 1.03917694, + "epoch": 0.3495575221238938, + "flos": 461120256000.0, + "grad_norm": 0.07868701205222765, + "language_loss": 0.8463372, + "learning_rate": 0.000755287233960883, + "loss": 0.85705012, + "num_input_tokens_seen": 150717200, + "router_z_loss_mlp": 0.32104492, + "step": 1817, + "time_per_iteration": 2.54315185546875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072544, + "balance_loss_mlp": 1.04023862, + "epoch": 0.3497499038091574, + "flos": 723859600896.0, + "grad_norm": 0.06602653795060065, + "language_loss": 0.77636009, + "learning_rate": 0.0007550193098960292, + "loss": 0.78708553, + "num_input_tokens_seen": 150790368, + "router_z_loss_mlp": 0.32299805, + "step": 1818, + "time_per_iteration": 2.848236560821533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076763, + "balance_loss_mlp": 1.04452837, + "epoch": 0.3499422854944209, + "flos": 827364132864.0, + "grad_norm": 0.049816715297611704, + "language_loss": 0.86387616, + "learning_rate": 0.0007547512868238988, + "loss": 0.8746438, + "num_input_tokens_seen": 150879872, + "router_z_loss_mlp": 0.32226562, + "step": 1819, + "time_per_iteration": 3.140552043914795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076553, + "balance_loss_mlp": 1.0441277, + "epoch": 0.3501346671796845, + "flos": 493214247936.0, + "grad_norm": 0.049810070694169546, + "language_loss": 0.83196282, + "learning_rate": 0.0007544831648485473, + "loss": 0.84272838, + "num_input_tokens_seen": 150953712, + "router_z_loss_mlp": 0.32421875, + "step": 1820, + "time_per_iteration": 2.7085797786712646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_mlp": 1.04179859, + "epoch": 0.35032704886494803, + "flos": 578479173120.0, + "grad_norm": 0.05987447994889705, + "language_loss": 0.81237, + "learning_rate": 0.0007542149440740694, + "loss": 0.82311416, + "num_input_tokens_seen": 151026192, + "router_z_loss_mlp": 0.32617188, + "step": 1821, + "time_per_iteration": 2.6648108959198 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075377, + "balance_loss_mlp": 1.0426898, + "epoch": 0.3505194305502116, + "flos": 584383472640.0, + "grad_norm": 0.06285767185927299, + "language_loss": 0.85454488, + "learning_rate": 0.000753946624604597, + "loss": 0.86529863, + "num_input_tokens_seen": 151100720, + "router_z_loss_mlp": 0.3269043, + "step": 1822, + "time_per_iteration": 2.7114102840423584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080366, + "balance_loss_mlp": 1.04722571, + "epoch": 0.3507118122354752, + "flos": 526705072128.0, + "grad_norm": 0.056571758739544044, + "language_loss": 0.88259315, + "learning_rate": 0.0007536782065443015, + "loss": 0.89339685, + "num_input_tokens_seen": 151166032, + "router_z_loss_mlp": 0.33154297, + "step": 1823, + "time_per_iteration": 2.6336190700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077518, + "balance_loss_mlp": 1.04576099, + "epoch": 0.35090419392073874, + "flos": 511269152256.0, + "grad_norm": 0.06612506998948281, + "language_loss": 0.74917412, + "learning_rate": 0.0007534096899973919, + "loss": 0.75994933, + "num_input_tokens_seen": 151232208, + "router_z_loss_mlp": 0.31738281, + "step": 1824, + "time_per_iteration": 2.5683584213256836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108279, + "balance_loss_mlp": 1.05069852, + "epoch": 0.3510965756060023, + "flos": 563728522752.0, + "grad_norm": 0.05207355522992398, + "language_loss": 0.82511663, + "learning_rate": 0.0007531410750681154, + "loss": 0.83594453, + "num_input_tokens_seen": 151308128, + "router_z_loss_mlp": 0.32080078, + "step": 1825, + "time_per_iteration": 2.7370071411132812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108428, + "balance_loss_mlp": 1.05207014, + "epoch": 0.35128895729126586, + "flos": 1020107146752.0, + "grad_norm": 0.05855996344544413, + "language_loss": 0.86223209, + "learning_rate": 0.0007528723618607575, + "loss": 0.87307489, + "num_input_tokens_seen": 151402560, + "router_z_loss_mlp": 0.32202148, + "step": 1826, + "time_per_iteration": 3.4230828285217285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080919, + "balance_loss_mlp": 1.04889941, + "epoch": 0.35148133897652944, + "flos": 587972915712.0, + "grad_norm": 0.06514472806491804, + "language_loss": 0.82370871, + "learning_rate": 0.0007526035504796422, + "loss": 0.8345179, + "num_input_tokens_seen": 151478816, + "router_z_loss_mlp": 0.32006836, + "step": 1827, + "time_per_iteration": 2.7472023963928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083394, + "balance_loss_mlp": 1.05011046, + "epoch": 0.351673720661793, + "flos": 495054830592.0, + "grad_norm": 0.13631276803870807, + "language_loss": 0.86120903, + "learning_rate": 0.0007523346410291312, + "loss": 0.87204289, + "num_input_tokens_seen": 151554528, + "router_z_loss_mlp": 0.33300781, + "step": 1828, + "time_per_iteration": 2.7665555477142334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080213, + "balance_loss_mlp": 1.04757404, + "epoch": 0.35186610234705656, + "flos": 762339520512.0, + "grad_norm": 0.04983334941453678, + "language_loss": 0.85021639, + "learning_rate": 0.0007520656336136245, + "loss": 0.86101854, + "num_input_tokens_seen": 151629440, + "router_z_loss_mlp": 0.32641602, + "step": 1829, + "time_per_iteration": 2.9405102729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080442, + "balance_loss_mlp": 1.04847038, + "epoch": 0.3520584840323201, + "flos": 625822820352.0, + "grad_norm": 0.049266285647792965, + "language_loss": 0.87560928, + "learning_rate": 0.0007517965283375599, + "loss": 0.88641369, + "num_input_tokens_seen": 151708544, + "router_z_loss_mlp": 0.31958008, + "step": 1830, + "time_per_iteration": 2.8742456436157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076907, + "balance_loss_mlp": 1.04429162, + "epoch": 0.3522508657175837, + "flos": 537124193280.0, + "grad_norm": 0.05152278098600794, + "language_loss": 0.8913554, + "learning_rate": 0.0007515273253054132, + "loss": 0.90212452, + "num_input_tokens_seen": 151779152, + "router_z_loss_mlp": 0.32617188, + "step": 1831, + "time_per_iteration": 2.647270917892456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085251, + "balance_loss_mlp": 1.0506562, + "epoch": 0.35244324740284727, + "flos": 567105560064.0, + "grad_norm": 0.052396269804254075, + "language_loss": 0.82697165, + "learning_rate": 0.0007512580246216988, + "loss": 0.83782411, + "num_input_tokens_seen": 151853216, + "router_z_loss_mlp": 0.34643555, + "step": 1832, + "time_per_iteration": 2.6887552738189697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079591, + "balance_loss_mlp": 1.04673672, + "epoch": 0.3526356290881108, + "flos": 512809989120.0, + "grad_norm": 0.05749675796225481, + "language_loss": 0.85263908, + "learning_rate": 0.000750988626390968, + "loss": 0.86343497, + "num_input_tokens_seen": 151920416, + "router_z_loss_mlp": 0.32861328, + "step": 1833, + "time_per_iteration": 2.6013457775115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080641, + "balance_loss_mlp": 1.04781032, + "epoch": 0.3528280107733744, + "flos": 595496627712.0, + "grad_norm": 0.05344239959905588, + "language_loss": 0.84880912, + "learning_rate": 0.0007507191307178108, + "loss": 0.8596155, + "num_input_tokens_seen": 151990848, + "router_z_loss_mlp": 0.32836914, + "step": 1834, + "time_per_iteration": 2.7490363121032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080747, + "balance_loss_mlp": 1.04682052, + "epoch": 0.3530203924586379, + "flos": 550969814016.0, + "grad_norm": 0.06515988244691072, + "language_loss": 0.74431223, + "learning_rate": 0.0007504495377068543, + "loss": 0.75511968, + "num_input_tokens_seen": 152064864, + "router_z_loss_mlp": 0.33959961, + "step": 1835, + "time_per_iteration": 2.729029417037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084115, + "balance_loss_mlp": 1.04925871, + "epoch": 0.3532127741439015, + "flos": 652662876672.0, + "grad_norm": 0.06759605529963146, + "language_loss": 0.81589389, + "learning_rate": 0.0007501798474627642, + "loss": 0.82673502, + "num_input_tokens_seen": 152150096, + "router_z_loss_mlp": 0.34912109, + "step": 1836, + "time_per_iteration": 2.9048030376434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080775, + "balance_loss_mlp": 1.04708695, + "epoch": 0.35340515582916504, + "flos": 722452594176.0, + "grad_norm": 0.055893281392717674, + "language_loss": 0.83221173, + "learning_rate": 0.0007499100600902433, + "loss": 0.84301955, + "num_input_tokens_seen": 152232528, + "router_z_loss_mlp": 0.3371582, + "step": 1837, + "time_per_iteration": 2.9900574684143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080937, + "balance_loss_mlp": 1.0464375, + "epoch": 0.35359753751442863, + "flos": 594619301376.0, + "grad_norm": 0.06113982905710786, + "language_loss": 0.84191763, + "learning_rate": 0.0007496401756940324, + "loss": 0.852727, + "num_input_tokens_seen": 152299584, + "router_z_loss_mlp": 0.34545898, + "step": 1838, + "time_per_iteration": 2.6746203899383545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079632, + "balance_loss_mlp": 1.04575253, + "epoch": 0.3537899191996922, + "flos": 632384838144.0, + "grad_norm": 0.05956961248716192, + "language_loss": 0.82392603, + "learning_rate": 0.0007493701943789098, + "loss": 0.8347224, + "num_input_tokens_seen": 152370368, + "router_z_loss_mlp": 0.33886719, + "step": 1839, + "time_per_iteration": 2.773550510406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089423, + "balance_loss_mlp": 1.05590141, + "epoch": 0.35398230088495575, + "flos": 506118523392.0, + "grad_norm": 0.05410374630174333, + "language_loss": 0.82311571, + "learning_rate": 0.000749100116249692, + "loss": 0.83400989, + "num_input_tokens_seen": 152436928, + "router_z_loss_mlp": 0.33544922, + "step": 1840, + "time_per_iteration": 2.6255862712860107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089923, + "balance_loss_mlp": 1.05649722, + "epoch": 0.35417468257021933, + "flos": 507783015936.0, + "grad_norm": 0.06109989504264522, + "language_loss": 0.86315167, + "learning_rate": 0.0007488299414112321, + "loss": 0.87405092, + "num_input_tokens_seen": 152505952, + "router_z_loss_mlp": 0.33447266, + "step": 1841, + "time_per_iteration": 2.5875229835510254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088685, + "balance_loss_mlp": 1.05552149, + "epoch": 0.35436706425548287, + "flos": 656133046272.0, + "grad_norm": 0.05742985112465967, + "language_loss": 0.77833533, + "learning_rate": 0.0007485596699684215, + "loss": 0.78922212, + "num_input_tokens_seen": 152577408, + "router_z_loss_mlp": 0.33178711, + "step": 1842, + "time_per_iteration": 2.819591760635376 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092886, + "balance_loss_mlp": 1.05903101, + "epoch": 0.35455944594074645, + "flos": 652322433024.0, + "grad_norm": 0.047878329403948795, + "language_loss": 0.85455877, + "learning_rate": 0.000748289302026189, + "loss": 0.86548758, + "num_input_tokens_seen": 152654480, + "router_z_loss_mlp": 0.33886719, + "step": 1843, + "time_per_iteration": 2.829897880554199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088118, + "balance_loss_mlp": 1.05569351, + "epoch": 0.35475182762601, + "flos": 848240252928.0, + "grad_norm": 0.06279452498251797, + "language_loss": 0.85658133, + "learning_rate": 0.0007480188376895004, + "loss": 0.86746252, + "num_input_tokens_seen": 152732304, + "router_z_loss_mlp": 0.32421875, + "step": 1844, + "time_per_iteration": 3.067828893661499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085552, + "balance_loss_mlp": 1.07358336, + "epoch": 0.3549442093112736, + "flos": 1520644133376.0, + "grad_norm": 0.027370210450034033, + "language_loss": 0.7381134, + "learning_rate": 0.0007477482770633596, + "loss": 0.7489689, + "num_input_tokens_seen": 152965952, + "router_z_loss_mlp": 0.11962891, + "step": 1845, + "time_per_iteration": 4.860119342803955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087227, + "balance_loss_mlp": 1.05401564, + "epoch": 0.3551365909965371, + "flos": 651087134208.0, + "grad_norm": 0.057022057365061586, + "language_loss": 0.7840451, + "learning_rate": 0.0007474776202528074, + "loss": 0.79491735, + "num_input_tokens_seen": 153053088, + "router_z_loss_mlp": 0.33227539, + "step": 1846, + "time_per_iteration": 2.924600601196289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081626, + "balance_loss_mlp": 1.04877198, + "epoch": 0.3553289726818007, + "flos": 897094213632.0, + "grad_norm": 0.05655103479540665, + "language_loss": 0.81245291, + "learning_rate": 0.000747206867362922, + "loss": 0.82326913, + "num_input_tokens_seen": 153129216, + "router_z_loss_mlp": 0.32861328, + "step": 1847, + "time_per_iteration": 3.0635437965393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_mlp": 1.0454942, + "epoch": 0.3555213543670643, + "flos": 688181958144.0, + "grad_norm": 0.057996459019562165, + "language_loss": 0.83748043, + "learning_rate": 0.0007469360184988194, + "loss": 0.84826624, + "num_input_tokens_seen": 153199360, + "router_z_loss_mlp": 0.33105469, + "step": 1848, + "time_per_iteration": 2.816774606704712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072989, + "balance_loss_mlp": 1.03977752, + "epoch": 0.3557137360523278, + "flos": 538305647616.0, + "grad_norm": 0.0578078380794177, + "language_loss": 0.87284935, + "learning_rate": 0.0007466650737656518, + "loss": 0.88357925, + "num_input_tokens_seen": 153269168, + "router_z_loss_mlp": 0.33203125, + "step": 1849, + "time_per_iteration": 2.611743927001953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075842, + "balance_loss_mlp": 1.04208231, + "epoch": 0.3559061177375914, + "flos": 402039230976.0, + "grad_norm": 0.05251231214094578, + "language_loss": 0.90093362, + "learning_rate": 0.0007463940332686098, + "loss": 0.91169202, + "num_input_tokens_seen": 153333120, + "router_z_loss_mlp": 0.33789062, + "step": 1850, + "time_per_iteration": 2.4692726135253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073866, + "balance_loss_mlp": 1.04017735, + "epoch": 0.35609849942285493, + "flos": 696238170624.0, + "grad_norm": 0.04795835093932571, + "language_loss": 0.84167922, + "learning_rate": 0.0007461228971129205, + "loss": 0.85241795, + "num_input_tokens_seen": 153407600, + "router_z_loss_mlp": 0.33691406, + "step": 1851, + "time_per_iteration": 2.894505023956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0106997, + "balance_loss_mlp": 1.0372591, + "epoch": 0.3562908811081185, + "flos": 568660953600.0, + "grad_norm": 0.055081415669052246, + "language_loss": 0.85513294, + "learning_rate": 0.0007458516654038483, + "loss": 0.86583257, + "num_input_tokens_seen": 153477408, + "router_z_loss_mlp": 0.32714844, + "step": 1852, + "time_per_iteration": 2.678018569946289 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076636, + "balance_loss_mlp": 1.0421133, + "epoch": 0.35648326279338205, + "flos": 682081219584.0, + "grad_norm": 0.04842584798560518, + "language_loss": 0.8668319, + "learning_rate": 0.0007455803382466946, + "loss": 0.87759829, + "num_input_tokens_seen": 153551888, + "router_z_loss_mlp": 0.34545898, + "step": 1853, + "time_per_iteration": 2.795799493789673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081318, + "balance_loss_mlp": 1.04674757, + "epoch": 0.35667564447864564, + "flos": 628840475136.0, + "grad_norm": 0.04891463031827082, + "language_loss": 0.87319207, + "learning_rate": 0.0007453089157467979, + "loss": 0.88400525, + "num_input_tokens_seen": 153626912, + "router_z_loss_mlp": 0.34594727, + "step": 1854, + "time_per_iteration": 2.7683348655700684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084525, + "balance_loss_mlp": 1.05000162, + "epoch": 0.35686802616390917, + "flos": 813685837824.0, + "grad_norm": 0.04901692214928195, + "language_loss": 0.81941634, + "learning_rate": 0.0007450373980095341, + "loss": 0.83026159, + "num_input_tokens_seen": 153711312, + "router_z_loss_mlp": 0.34545898, + "step": 1855, + "time_per_iteration": 3.069664716720581 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088761, + "balance_loss_mlp": 1.0541904, + "epoch": 0.35706040784917276, + "flos": 525922288128.0, + "grad_norm": 0.06393454459125486, + "language_loss": 0.86792582, + "learning_rate": 0.0007447657851403155, + "loss": 0.87881339, + "num_input_tokens_seen": 153780208, + "router_z_loss_mlp": 0.34619141, + "step": 1856, + "time_per_iteration": 2.6120662689208984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081268, + "balance_loss_mlp": 1.04793692, + "epoch": 0.35725278953443634, + "flos": 511698345984.0, + "grad_norm": 0.060959809088696394, + "language_loss": 0.78963649, + "learning_rate": 0.0007444940772445915, + "loss": 0.80044913, + "num_input_tokens_seen": 153853152, + "router_z_loss_mlp": 0.33349609, + "step": 1857, + "time_per_iteration": 2.802053689956665 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079861, + "balance_loss_mlp": 1.04653037, + "epoch": 0.3574451712196999, + "flos": 487162971648.0, + "grad_norm": 0.06448223618511208, + "language_loss": 0.80338144, + "learning_rate": 0.0007442222744278484, + "loss": 0.81418002, + "num_input_tokens_seen": 153924160, + "router_z_loss_mlp": 0.33349609, + "step": 1858, + "time_per_iteration": 2.660689353942871 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080529, + "balance_loss_mlp": 1.04765141, + "epoch": 0.35763755290496346, + "flos": 550384879104.0, + "grad_norm": 0.061962253699798436, + "language_loss": 0.84126002, + "learning_rate": 0.0007439503767956099, + "loss": 0.85206527, + "num_input_tokens_seen": 153998688, + "router_z_loss_mlp": 0.32885742, + "step": 1859, + "time_per_iteration": 2.7479875087738037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080547, + "balance_loss_mlp": 1.06767237, + "epoch": 0.357829934590227, + "flos": 1503300791808.0, + "grad_norm": 0.035903025748828234, + "language_loss": 0.79671603, + "learning_rate": 0.0007436783844534352, + "loss": 0.80752152, + "num_input_tokens_seen": 154230960, + "router_z_loss_mlp": 0.12890625, + "step": 1860, + "time_per_iteration": 4.900041580200195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077433, + "balance_loss_mlp": 1.04479325, + "epoch": 0.3580223162754906, + "flos": 568410670080.0, + "grad_norm": 0.040558802150678905, + "language_loss": 0.85799539, + "learning_rate": 0.000743406297506922, + "loss": 0.86876976, + "num_input_tokens_seen": 154309104, + "router_z_loss_mlp": 0.32641602, + "step": 1861, + "time_per_iteration": 2.701162576675415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084428, + "balance_loss_mlp": 1.05107355, + "epoch": 0.3582146979607541, + "flos": 626153089536.0, + "grad_norm": 0.04686630584337546, + "language_loss": 0.8419295, + "learning_rate": 0.0007431341160617031, + "loss": 0.85277379, + "num_input_tokens_seen": 154387424, + "router_z_loss_mlp": 0.33374023, + "step": 1862, + "time_per_iteration": 2.860173463821411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085309, + "balance_loss_mlp": 1.05266929, + "epoch": 0.3584070796460177, + "flos": 507010406400.0, + "grad_norm": 0.04939599291948986, + "language_loss": 0.88143289, + "learning_rate": 0.0007428618402234491, + "loss": 0.89228594, + "num_input_tokens_seen": 154459952, + "router_z_loss_mlp": 0.32641602, + "step": 1863, + "time_per_iteration": 2.62233567237854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083488, + "balance_loss_mlp": 1.05030036, + "epoch": 0.3585994613312813, + "flos": 606190763520.0, + "grad_norm": 0.051497717495276533, + "language_loss": 0.80248374, + "learning_rate": 0.0007425894700978668, + "loss": 0.81331861, + "num_input_tokens_seen": 154535456, + "router_z_loss_mlp": 0.33203125, + "step": 1864, + "time_per_iteration": 2.711484670639038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087216, + "balance_loss_mlp": 1.05424261, + "epoch": 0.3587918430165448, + "flos": 1412338484736.0, + "grad_norm": 0.047877863134497434, + "language_loss": 0.79232943, + "learning_rate": 0.0007423170057906996, + "loss": 0.80320162, + "num_input_tokens_seen": 154627568, + "router_z_loss_mlp": 0.32983398, + "step": 1865, + "time_per_iteration": 3.852776527404785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091453, + "balance_loss_mlp": 1.05821717, + "epoch": 0.3589842247018084, + "flos": 478313800704.0, + "grad_norm": 0.06431447428318769, + "language_loss": 0.85827845, + "learning_rate": 0.0007420444474077275, + "loss": 0.86919296, + "num_input_tokens_seen": 154694640, + "router_z_loss_mlp": 0.33251953, + "step": 1866, + "time_per_iteration": 2.6104037761688232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101796, + "balance_loss_mlp": 1.06829846, + "epoch": 0.35917660638707194, + "flos": 504464205312.0, + "grad_norm": 0.06438653143979521, + "language_loss": 0.89830631, + "learning_rate": 0.0007417717950547671, + "loss": 0.90932429, + "num_input_tokens_seen": 154762048, + "router_z_loss_mlp": 0.33520508, + "step": 1867, + "time_per_iteration": 2.5619330406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108848, + "balance_loss_mlp": 1.09687901, + "epoch": 0.3593689880723355, + "flos": 1491294191616.0, + "grad_norm": 0.037524520389889536, + "language_loss": 0.75996608, + "learning_rate": 0.0007414990488376713, + "loss": 0.77105457, + "num_input_tokens_seen": 154989952, + "router_z_loss_mlp": 0.11962891, + "step": 1868, + "time_per_iteration": 4.971943378448486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096427, + "balance_loss_mlp": 1.06388319, + "epoch": 0.35956136975759906, + "flos": 528369564672.0, + "grad_norm": 0.050983088733796166, + "language_loss": 0.84612024, + "learning_rate": 0.0007412262088623299, + "loss": 0.85708451, + "num_input_tokens_seen": 155066992, + "router_z_loss_mlp": 0.32543945, + "step": 1869, + "time_per_iteration": 2.7295072078704834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104656, + "balance_loss_mlp": 1.07142007, + "epoch": 0.35975375144286265, + "flos": 534647803392.0, + "grad_norm": 0.057848782745497714, + "language_loss": 0.79012549, + "learning_rate": 0.0007409532752346684, + "loss": 0.80117208, + "num_input_tokens_seen": 155137616, + "router_z_loss_mlp": 0.33251953, + "step": 1870, + "time_per_iteration": 2.74664568901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107858, + "balance_loss_mlp": 1.07464623, + "epoch": 0.3599461331281262, + "flos": 504695549952.0, + "grad_norm": 0.054621035664709404, + "language_loss": 0.88661271, + "learning_rate": 0.0007406802480606491, + "loss": 0.89769125, + "num_input_tokens_seen": 155209248, + "router_z_loss_mlp": 0.33227539, + "step": 1871, + "time_per_iteration": 2.636101722717285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.06923246, + "epoch": 0.36013851481338977, + "flos": 511283708928.0, + "grad_norm": 0.05849515281409536, + "language_loss": 0.90592384, + "learning_rate": 0.0007404071274462707, + "loss": 0.91694903, + "num_input_tokens_seen": 155274176, + "router_z_loss_mlp": 0.33300781, + "step": 1872, + "time_per_iteration": 2.559588670730591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098641, + "balance_loss_mlp": 1.06533384, + "epoch": 0.36033089649865335, + "flos": 547330908672.0, + "grad_norm": 0.06237198940644659, + "language_loss": 0.8363173, + "learning_rate": 0.0007401339134975682, + "loss": 0.84730369, + "num_input_tokens_seen": 155343232, + "router_z_loss_mlp": 0.33325195, + "step": 1873, + "time_per_iteration": 2.6156845092773438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100197, + "balance_loss_mlp": 1.06617522, + "epoch": 0.3605232781839169, + "flos": 458416903680.0, + "grad_norm": 0.05108892475659159, + "language_loss": 0.84187275, + "learning_rate": 0.0007398606063206122, + "loss": 0.85287476, + "num_input_tokens_seen": 155410080, + "router_z_loss_mlp": 0.34033203, + "step": 1874, + "time_per_iteration": 2.6152756214141846 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090341, + "balance_loss_mlp": 1.05729628, + "epoch": 0.36071565986918047, + "flos": 509309296128.0, + "grad_norm": 0.05589329807105905, + "language_loss": 0.7857852, + "learning_rate": 0.0007395872060215101, + "loss": 0.79668868, + "num_input_tokens_seen": 155476240, + "router_z_loss_mlp": 0.33056641, + "step": 1875, + "time_per_iteration": 2.592906951904297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095107, + "balance_loss_mlp": 1.06230044, + "epoch": 0.360908041554444, + "flos": 558931484160.0, + "grad_norm": 0.12468103825296885, + "language_loss": 0.88329792, + "learning_rate": 0.0007393137127064056, + "loss": 0.89424896, + "num_input_tokens_seen": 155543392, + "router_z_loss_mlp": 0.328125, + "step": 1876, + "time_per_iteration": 2.629368782043457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109653, + "balance_loss_mlp": 1.06300879, + "epoch": 0.3611004232397076, + "flos": 523588492800.0, + "grad_norm": 0.05189881397754868, + "language_loss": 0.84167802, + "learning_rate": 0.0007390401264814779, + "loss": 0.85264337, + "num_input_tokens_seen": 155613264, + "router_z_loss_mlp": 0.33544922, + "step": 1877, + "time_per_iteration": 2.644322156906128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084049, + "balance_loss_mlp": 1.05179131, + "epoch": 0.3612928049249711, + "flos": 540728193024.0, + "grad_norm": 0.07312313725982984, + "language_loss": 0.84472072, + "learning_rate": 0.0007387664474529427, + "loss": 0.8555612, + "num_input_tokens_seen": 155683712, + "router_z_loss_mlp": 0.32250977, + "step": 1878, + "time_per_iteration": 2.6105034351348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094149, + "balance_loss_mlp": 1.06131935, + "epoch": 0.3614851866102347, + "flos": 552289480704.0, + "grad_norm": 0.06338398309504269, + "language_loss": 0.9129535, + "learning_rate": 0.0007384926757270518, + "loss": 0.923895, + "num_input_tokens_seen": 155751760, + "router_z_loss_mlp": 0.32836914, + "step": 1879, + "time_per_iteration": 2.6268200874328613 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082708, + "balance_loss_mlp": 1.05023539, + "epoch": 0.36167756829549824, + "flos": 771734338560.0, + "grad_norm": 0.048672507925477976, + "language_loss": 0.79680419, + "learning_rate": 0.0007382188114100924, + "loss": 0.80763125, + "num_input_tokens_seen": 155830464, + "router_z_loss_mlp": 0.32470703, + "step": 1880, + "time_per_iteration": 2.9548373222351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078144, + "balance_loss_mlp": 1.04509938, + "epoch": 0.36186994998076183, + "flos": 711560609280.0, + "grad_norm": 0.04804943389379678, + "language_loss": 0.81544787, + "learning_rate": 0.0007379448546083884, + "loss": 0.82622933, + "num_input_tokens_seen": 155906208, + "router_z_loss_mlp": 0.33056641, + "step": 1881, + "time_per_iteration": 2.900480031967163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082413, + "balance_loss_mlp": 1.04884315, + "epoch": 0.3620623316660254, + "flos": 747209138688.0, + "grad_norm": 0.049920719635936736, + "language_loss": 0.88019323, + "learning_rate": 0.0007376708054282992, + "loss": 0.89101738, + "num_input_tokens_seen": 155983584, + "router_z_loss_mlp": 0.3359375, + "step": 1882, + "time_per_iteration": 2.9482829570770264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075385, + "balance_loss_mlp": 1.04286492, + "epoch": 0.36225471335128895, + "flos": 482312088576.0, + "grad_norm": 0.04692483307288239, + "language_loss": 0.83908749, + "learning_rate": 0.0007373966639762201, + "loss": 0.84984136, + "num_input_tokens_seen": 156052464, + "router_z_loss_mlp": 0.32519531, + "step": 1883, + "time_per_iteration": 2.597809076309204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078049, + "balance_loss_mlp": 1.0448606, + "epoch": 0.36244709503655254, + "flos": 506655406080.0, + "grad_norm": 0.0703007209611724, + "language_loss": 0.8835175, + "learning_rate": 0.0007371224303585822, + "loss": 0.89429802, + "num_input_tokens_seen": 156121424, + "router_z_loss_mlp": 0.33203125, + "step": 1884, + "time_per_iteration": 2.5686471462249756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01046189, + "balance_loss_mlp": 1.03360081, + "epoch": 0.36263947672181607, + "flos": 1393302643200.0, + "grad_norm": 0.020620128786032376, + "language_loss": 0.80357069, + "learning_rate": 0.0007368481046818524, + "loss": 0.81403255, + "num_input_tokens_seen": 156346144, + "router_z_loss_mlp": 0.12597656, + "step": 1885, + "time_per_iteration": 4.68831205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073343, + "balance_loss_mlp": 1.03939199, + "epoch": 0.36283185840707965, + "flos": 652991735808.0, + "grad_norm": 0.05943029236677907, + "language_loss": 0.82845902, + "learning_rate": 0.0007365736870525335, + "loss": 0.83919251, + "num_input_tokens_seen": 156420880, + "router_z_loss_mlp": 0.33959961, + "step": 1886, + "time_per_iteration": 2.8566346168518066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070323, + "balance_loss_mlp": 1.0373739, + "epoch": 0.3630242400923432, + "flos": 488619440640.0, + "grad_norm": 0.06703223685064427, + "language_loss": 0.82574463, + "learning_rate": 0.000736299177577164, + "loss": 0.83644783, + "num_input_tokens_seen": 156485616, + "router_z_loss_mlp": 0.32958984, + "step": 1887, + "time_per_iteration": 2.5848894119262695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074233, + "balance_loss_mlp": 1.04130709, + "epoch": 0.3632166217776068, + "flos": 516892644864.0, + "grad_norm": 0.0626455482667494, + "language_loss": 0.83844066, + "learning_rate": 0.0007360245763623174, + "loss": 0.84918302, + "num_input_tokens_seen": 156557840, + "router_z_loss_mlp": 0.3293457, + "step": 1888, + "time_per_iteration": 2.6179397106170654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01067525, + "balance_loss_mlp": 1.03564882, + "epoch": 0.36340900346287036, + "flos": 645881250816.0, + "grad_norm": 0.06111369810549259, + "language_loss": 0.89794236, + "learning_rate": 0.0007357498835146039, + "loss": 0.90861762, + "num_input_tokens_seen": 156632496, + "router_z_loss_mlp": 0.31860352, + "step": 1889, + "time_per_iteration": 2.8311662673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070281, + "balance_loss_mlp": 1.03854752, + "epoch": 0.3636013851481339, + "flos": 553057708032.0, + "grad_norm": 0.0568549422608731, + "language_loss": 0.86402494, + "learning_rate": 0.0007354750991406684, + "loss": 0.87472773, + "num_input_tokens_seen": 156705296, + "router_z_loss_mlp": 0.31713867, + "step": 1890, + "time_per_iteration": 2.6922197341918945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072041, + "balance_loss_mlp": 1.03952074, + "epoch": 0.3637937668333975, + "flos": 546395355648.0, + "grad_norm": 0.053455628499382915, + "language_loss": 0.80957252, + "learning_rate": 0.0007352002233471919, + "loss": 0.82029295, + "num_input_tokens_seen": 156773376, + "router_z_loss_mlp": 0.32519531, + "step": 1891, + "time_per_iteration": 2.621241569519043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066281, + "balance_loss_mlp": 1.03371286, + "epoch": 0.363986148518661, + "flos": 537838576128.0, + "grad_norm": 0.07508945751401845, + "language_loss": 0.79549944, + "learning_rate": 0.0007349252562408906, + "loss": 0.80616224, + "num_input_tokens_seen": 156844336, + "router_z_loss_mlp": 0.32543945, + "step": 1892, + "time_per_iteration": 2.674318552017212 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069175, + "balance_loss_mlp": 1.0372504, + "epoch": 0.3641785302039246, + "flos": 659895607296.0, + "grad_norm": 0.04761623500947703, + "language_loss": 0.81258041, + "learning_rate": 0.0007346501979285158, + "loss": 0.82327211, + "num_input_tokens_seen": 156918848, + "router_z_loss_mlp": 0.3190918, + "step": 1893, + "time_per_iteration": 2.8671226501464844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_mlp": 1.0238719, + "epoch": 0.36437091188918813, + "flos": 1467911158272.0, + "grad_norm": 0.02143179240630706, + "language_loss": 0.80539101, + "learning_rate": 0.0007343750485168551, + "loss": 0.81574464, + "num_input_tokens_seen": 157134736, + "router_z_loss_mlp": 0.11474609, + "step": 1894, + "time_per_iteration": 4.7720019817352295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075531, + "balance_loss_mlp": 1.04248571, + "epoch": 0.3645632935744517, + "flos": 597012733440.0, + "grad_norm": 0.049808711864839754, + "language_loss": 0.85850054, + "learning_rate": 0.0007340998081127308, + "loss": 0.8692559, + "num_input_tokens_seen": 157211920, + "router_z_loss_mlp": 0.33056641, + "step": 1895, + "time_per_iteration": 2.753730058670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081037, + "balance_loss_mlp": 1.0486834, + "epoch": 0.36475567525971525, + "flos": 599214108672.0, + "grad_norm": 0.05384470863640996, + "language_loss": 0.9063257, + "learning_rate": 0.0007338244768230007, + "loss": 0.91713607, + "num_input_tokens_seen": 157284224, + "router_z_loss_mlp": 0.32348633, + "step": 1896, + "time_per_iteration": 2.749844551086426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084345, + "balance_loss_mlp": 1.05189669, + "epoch": 0.36494805694497884, + "flos": 798047686656.0, + "grad_norm": 0.12041633701688108, + "language_loss": 0.88843018, + "learning_rate": 0.0007335490547545578, + "loss": 0.89927363, + "num_input_tokens_seen": 157367920, + "router_z_loss_mlp": 0.32446289, + "step": 1897, + "time_per_iteration": 3.0181519985198975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089996, + "balance_loss_mlp": 1.05795288, + "epoch": 0.3651404386302424, + "flos": 637023315456.0, + "grad_norm": 0.06340749789439089, + "language_loss": 0.82377589, + "learning_rate": 0.0007332735420143308, + "loss": 0.83467579, + "num_input_tokens_seen": 157438672, + "router_z_loss_mlp": 0.3203125, + "step": 1898, + "time_per_iteration": 2.7370855808258057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077681, + "balance_loss_mlp": 1.04442132, + "epoch": 0.36533282031550596, + "flos": 491337349632.0, + "grad_norm": 0.05751458989837244, + "language_loss": 0.8663426, + "learning_rate": 0.0007329979387092826, + "loss": 0.87711942, + "num_input_tokens_seen": 157505888, + "router_z_loss_mlp": 0.33276367, + "step": 1899, + "time_per_iteration": 2.552072048187256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076766, + "balance_loss_mlp": 1.0440551, + "epoch": 0.36552520200076954, + "flos": 855587874816.0, + "grad_norm": 0.050366197091212025, + "language_loss": 0.83863711, + "learning_rate": 0.0007327222449464124, + "loss": 0.84940481, + "num_input_tokens_seen": 157601568, + "router_z_loss_mlp": 0.32714844, + "step": 1900, + "time_per_iteration": 3.2450594902038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072697, + "balance_loss_mlp": 1.03936601, + "epoch": 0.3657175836860331, + "flos": 483449872896.0, + "grad_norm": 0.053278478248789174, + "language_loss": 0.88864619, + "learning_rate": 0.0007324464608327538, + "loss": 0.89937317, + "num_input_tokens_seen": 157670992, + "router_z_loss_mlp": 0.33349609, + "step": 1901, + "time_per_iteration": 2.6027348041534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01072333, + "balance_loss_mlp": 1.03957391, + "epoch": 0.36590996537129666, + "flos": 434561006592.0, + "grad_norm": 0.058664113400220264, + "language_loss": 0.88440275, + "learning_rate": 0.0007321705864753758, + "loss": 0.8951261, + "num_input_tokens_seen": 157743616, + "router_z_loss_mlp": 0.32763672, + "step": 1902, + "time_per_iteration": 2.668935537338257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073172, + "balance_loss_mlp": 1.03950715, + "epoch": 0.3661023470565602, + "flos": 711880704000.0, + "grad_norm": 0.047699393186438684, + "language_loss": 0.84307706, + "learning_rate": 0.0007318946219813823, + "loss": 0.85380876, + "num_input_tokens_seen": 157823520, + "router_z_loss_mlp": 0.33691406, + "step": 1903, + "time_per_iteration": 3.025866985321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074477, + "balance_loss_mlp": 1.04100275, + "epoch": 0.3662947287418238, + "flos": 564495340032.0, + "grad_norm": 0.05797091317965262, + "language_loss": 0.90078342, + "learning_rate": 0.000731618567457912, + "loss": 0.91152817, + "num_input_tokens_seen": 157893248, + "router_z_loss_mlp": 0.3347168, + "step": 1904, + "time_per_iteration": 2.6391115188598633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073585, + "balance_loss_mlp": 1.03937197, + "epoch": 0.3664871104270873, + "flos": 789391982592.0, + "grad_norm": 0.05925410463566973, + "language_loss": 0.87083924, + "learning_rate": 0.000731342423012139, + "loss": 0.88157511, + "num_input_tokens_seen": 157973216, + "router_z_loss_mlp": 0.3425293, + "step": 1905, + "time_per_iteration": 3.020660400390625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070848, + "balance_loss_mlp": 1.03682566, + "epoch": 0.3666794921123509, + "flos": 752202616320.0, + "grad_norm": 0.06601024748857935, + "language_loss": 0.82244205, + "learning_rate": 0.0007310661887512722, + "loss": 0.83315057, + "num_input_tokens_seen": 158051088, + "router_z_loss_mlp": 0.34033203, + "step": 1906, + "time_per_iteration": 3.0128185749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068568, + "balance_loss_mlp": 1.03571391, + "epoch": 0.3668718737976145, + "flos": 523264015872.0, + "grad_norm": 0.04853340441162438, + "language_loss": 0.82115662, + "learning_rate": 0.0007307898647825549, + "loss": 0.8318423, + "num_input_tokens_seen": 158124368, + "router_z_loss_mlp": 0.32861328, + "step": 1907, + "time_per_iteration": 2.6610257625579834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075142, + "balance_loss_mlp": 1.04126275, + "epoch": 0.367064255482878, + "flos": 571698957312.0, + "grad_norm": 0.05773956677348378, + "language_loss": 0.89470363, + "learning_rate": 0.0007305134512132659, + "loss": 0.90545505, + "num_input_tokens_seen": 158191472, + "router_z_loss_mlp": 0.33886719, + "step": 1908, + "time_per_iteration": 2.706658124923706 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076976, + "balance_loss_mlp": 1.04309678, + "epoch": 0.3672566371681416, + "flos": 446880347136.0, + "grad_norm": 0.0894454707503668, + "language_loss": 0.83388865, + "learning_rate": 0.0007302369481507183, + "loss": 0.84465849, + "num_input_tokens_seen": 158254384, + "router_z_loss_mlp": 0.33911133, + "step": 1909, + "time_per_iteration": 2.499483346939087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01049259, + "balance_loss_mlp": 1.03838694, + "epoch": 0.36744901885340514, + "flos": 1539275208192.0, + "grad_norm": 0.032302576214162944, + "language_loss": 0.79961759, + "learning_rate": 0.00072996035570226, + "loss": 0.81011015, + "num_input_tokens_seen": 158486160, + "router_z_loss_mlp": 0.10888672, + "step": 1910, + "time_per_iteration": 4.86514949798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088721, + "balance_loss_mlp": 1.05476999, + "epoch": 0.36764140053866873, + "flos": 563417192448.0, + "grad_norm": 0.061805914783829616, + "language_loss": 0.85575247, + "learning_rate": 0.000729683673975274, + "loss": 0.86663967, + "num_input_tokens_seen": 158555616, + "router_z_loss_mlp": 0.33984375, + "step": 1911, + "time_per_iteration": 2.6907522678375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091619, + "balance_loss_mlp": 1.05709589, + "epoch": 0.36783378222393226, + "flos": 1216168971264.0, + "grad_norm": 0.04498413319979697, + "language_loss": 0.82746279, + "learning_rate": 0.0007294069030771774, + "loss": 0.83837891, + "num_input_tokens_seen": 158653984, + "router_z_loss_mlp": 0.34570312, + "step": 1912, + "time_per_iteration": 3.6445353031158447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109623, + "balance_loss_mlp": 1.06196952, + "epoch": 0.36802616390919585, + "flos": 498476947968.0, + "grad_norm": 0.055898807174015214, + "language_loss": 0.90671504, + "learning_rate": 0.0007291300431154224, + "loss": 0.9176774, + "num_input_tokens_seen": 158719728, + "router_z_loss_mlp": 0.34301758, + "step": 1913, + "time_per_iteration": 2.5600366592407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_mlp": 1.0155319, + "epoch": 0.36821854559445943, + "flos": 1581281961984.0, + "grad_norm": 0.015307788275572325, + "language_loss": 0.70389736, + "learning_rate": 0.0007288530941974955, + "loss": 0.71415472, + "num_input_tokens_seen": 158952544, + "router_z_loss_mlp": 0.10205078, + "step": 1914, + "time_per_iteration": 4.9577555656433105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110633, + "balance_loss_mlp": 1.07287991, + "epoch": 0.36841092727972297, + "flos": 835261784064.0, + "grad_norm": 0.06223209716338702, + "language_loss": 0.79735458, + "learning_rate": 0.0007285760564309179, + "loss": 0.80841786, + "num_input_tokens_seen": 159039680, + "router_z_loss_mlp": 0.3347168, + "step": 1915, + "time_per_iteration": 3.1251590251922607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101251, + "balance_loss_mlp": 1.06811082, + "epoch": 0.36860330896498655, + "flos": 689517591552.0, + "grad_norm": 0.05672479428696366, + "language_loss": 0.85010201, + "learning_rate": 0.0007282989299232448, + "loss": 0.8611145, + "num_input_tokens_seen": 159128128, + "router_z_loss_mlp": 0.33154297, + "step": 1916, + "time_per_iteration": 3.062971353530884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096553, + "balance_loss_mlp": 1.06381774, + "epoch": 0.3687956906502501, + "flos": 553919067648.0, + "grad_norm": 0.05955658020637064, + "language_loss": 0.83600092, + "learning_rate": 0.0007280217147820668, + "loss": 0.84696645, + "num_input_tokens_seen": 159193248, + "router_z_loss_mlp": 0.32739258, + "step": 1917, + "time_per_iteration": 2.6169495582580566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097845, + "balance_loss_mlp": 1.06465673, + "epoch": 0.3689880723355137, + "flos": 576426184704.0, + "grad_norm": 0.05443515430960571, + "language_loss": 0.79137111, + "learning_rate": 0.0007277444111150079, + "loss": 0.80234957, + "num_input_tokens_seen": 159265824, + "router_z_loss_mlp": 0.33203125, + "step": 1918, + "time_per_iteration": 2.672696828842163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101204, + "balance_loss_mlp": 1.06820679, + "epoch": 0.3691804540207772, + "flos": 528615465984.0, + "grad_norm": 0.06564490716140688, + "language_loss": 0.84340626, + "learning_rate": 0.0007274670190297272, + "loss": 0.85441828, + "num_input_tokens_seen": 159332992, + "router_z_loss_mlp": 0.33007812, + "step": 1919, + "time_per_iteration": 2.569920539855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091279, + "balance_loss_mlp": 1.0575906, + "epoch": 0.3693728357060408, + "flos": 560729806848.0, + "grad_norm": 0.06475948680742319, + "language_loss": 0.81988895, + "learning_rate": 0.0007271895386339179, + "loss": 0.83080173, + "num_input_tokens_seen": 159409808, + "router_z_loss_mlp": 0.3371582, + "step": 1920, + "time_per_iteration": 2.765470027923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086642, + "balance_loss_mlp": 1.05350161, + "epoch": 0.3695652173913043, + "flos": 579488919552.0, + "grad_norm": 0.0536525739451854, + "language_loss": 0.82950377, + "learning_rate": 0.0007269119700353073, + "loss": 0.84037018, + "num_input_tokens_seen": 159486128, + "router_z_loss_mlp": 0.33154297, + "step": 1921, + "time_per_iteration": 2.703117847442627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05756688, + "epoch": 0.3697575990765679, + "flos": 512629516800.0, + "grad_norm": 0.04104943724396866, + "language_loss": 0.84983069, + "learning_rate": 0.0007266343133416571, + "loss": 0.86072791, + "num_input_tokens_seen": 159562224, + "router_z_loss_mlp": 0.3215332, + "step": 1922, + "time_per_iteration": 2.7371909618377686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01060973, + "balance_loss_mlp": 1.04967153, + "epoch": 0.3699499807618315, + "flos": 1569826953216.0, + "grad_norm": 0.023907464900796205, + "language_loss": 0.77116919, + "learning_rate": 0.0007263565686607632, + "loss": 0.78177893, + "num_input_tokens_seen": 159784768, + "router_z_loss_mlp": 0.11279297, + "step": 1923, + "time_per_iteration": 4.827981233596802 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084577, + "balance_loss_mlp": 1.05136538, + "epoch": 0.37014236244709503, + "flos": 497093262336.0, + "grad_norm": 0.06739223877154035, + "language_loss": 0.84575641, + "learning_rate": 0.0007260787361004556, + "loss": 0.85660219, + "num_input_tokens_seen": 159848608, + "router_z_loss_mlp": 0.33227539, + "step": 1924, + "time_per_iteration": 2.6221601963043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01048708, + "balance_loss_mlp": 1.03764546, + "epoch": 0.3703347441323586, + "flos": 1443562048512.0, + "grad_norm": 0.02017040526472397, + "language_loss": 0.73761505, + "learning_rate": 0.0007258008157685987, + "loss": 0.74810213, + "num_input_tokens_seen": 160080928, + "router_z_loss_mlp": 0.11083984, + "step": 1925, + "time_per_iteration": 4.909639120101929 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083689, + "balance_loss_mlp": 1.04997683, + "epoch": 0.37052712581762215, + "flos": 563324060160.0, + "grad_norm": 0.19489786122972683, + "language_loss": 0.87265027, + "learning_rate": 0.0007255228077730903, + "loss": 0.88348716, + "num_input_tokens_seen": 160148976, + "router_z_loss_mlp": 0.33740234, + "step": 1926, + "time_per_iteration": 2.726412773132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092339, + "balance_loss_mlp": 1.05850744, + "epoch": 0.37071950750288574, + "flos": 925706451456.0, + "grad_norm": 0.06639539702607969, + "language_loss": 0.81730163, + "learning_rate": 0.0007252447122218632, + "loss": 0.82822502, + "num_input_tokens_seen": 160233504, + "router_z_loss_mlp": 0.33862305, + "step": 1927, + "time_per_iteration": 3.157439708709717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090678, + "balance_loss_mlp": 1.05710912, + "epoch": 0.37091188918814927, + "flos": 418090609152.0, + "grad_norm": 0.06586667444600991, + "language_loss": 0.87736213, + "learning_rate": 0.0007249665292228834, + "loss": 0.88826889, + "num_input_tokens_seen": 160299696, + "router_z_loss_mlp": 0.3359375, + "step": 1928, + "time_per_iteration": 2.569284677505493 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086968, + "balance_loss_mlp": 1.05208778, + "epoch": 0.37110427087341286, + "flos": 462941899776.0, + "grad_norm": 0.056849308308669105, + "language_loss": 0.83676869, + "learning_rate": 0.000724688258884151, + "loss": 0.84763837, + "num_input_tokens_seen": 160367904, + "router_z_loss_mlp": 0.34912109, + "step": 1929, + "time_per_iteration": 2.522596597671509 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085634, + "balance_loss_mlp": 1.05204105, + "epoch": 0.3712966525586764, + "flos": 849303843840.0, + "grad_norm": 0.0484214736208702, + "language_loss": 0.86208755, + "learning_rate": 0.0007244099013137002, + "loss": 0.87294388, + "num_input_tokens_seen": 160453600, + "router_z_loss_mlp": 0.33618164, + "step": 1930, + "time_per_iteration": 3.055302619934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089345, + "balance_loss_mlp": 1.05370176, + "epoch": 0.37148903424394, + "flos": 925555092480.0, + "grad_norm": 0.05147814185741214, + "language_loss": 0.88918859, + "learning_rate": 0.0007241314566195993, + "loss": 0.90008199, + "num_input_tokens_seen": 160543472, + "router_z_loss_mlp": 0.35693359, + "step": 1931, + "time_per_iteration": 3.249950408935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108542, + "balance_loss_mlp": 1.05020559, + "epoch": 0.37168141592920356, + "flos": 519565473792.0, + "grad_norm": 0.061459583473066896, + "language_loss": 0.85347825, + "learning_rate": 0.0007238529249099496, + "loss": 0.86433244, + "num_input_tokens_seen": 160614016, + "router_z_loss_mlp": 0.35253906, + "step": 1932, + "time_per_iteration": 2.603287696838379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01068675, + "balance_loss_mlp": 1.05599129, + "epoch": 0.3718737976144671, + "flos": 1445107267584.0, + "grad_norm": 0.02721294021284605, + "language_loss": 0.77856874, + "learning_rate": 0.0007235743062928872, + "loss": 0.7892555, + "num_input_tokens_seen": 160828640, + "router_z_loss_mlp": 0.12695312, + "step": 1933, + "time_per_iteration": 4.850685358047485 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089729, + "balance_loss_mlp": 1.05346537, + "epoch": 0.3720661792997307, + "flos": 759218558976.0, + "grad_norm": 0.08735029164116491, + "language_loss": 0.80658156, + "learning_rate": 0.000723295600876581, + "loss": 0.81747884, + "num_input_tokens_seen": 160913088, + "router_z_loss_mlp": 0.36279297, + "step": 1934, + "time_per_iteration": 3.0082099437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092646, + "balance_loss_mlp": 1.05690694, + "epoch": 0.3722585609849942, + "flos": 516686031360.0, + "grad_norm": 0.1760204301041219, + "language_loss": 0.8798061, + "learning_rate": 0.0007230168087692344, + "loss": 0.89073259, + "num_input_tokens_seen": 160982960, + "router_z_loss_mlp": 0.35791016, + "step": 1935, + "time_per_iteration": 2.7076756954193115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095918, + "balance_loss_mlp": 1.06070328, + "epoch": 0.3724509426702578, + "flos": 782114171904.0, + "grad_norm": 0.058450977170247324, + "language_loss": 0.82290804, + "learning_rate": 0.0007227379300790839, + "loss": 0.83386725, + "num_input_tokens_seen": 161066000, + "router_z_loss_mlp": 0.35205078, + "step": 1936, + "time_per_iteration": 3.0381107330322266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108913, + "balance_loss_mlp": 1.07262528, + "epoch": 0.37264332435552133, + "flos": 391502246400.0, + "grad_norm": 0.062314619417064634, + "language_loss": 0.85779369, + "learning_rate": 0.0007224589649143997, + "loss": 0.86888283, + "num_input_tokens_seen": 161131040, + "router_z_loss_mlp": 0.36328125, + "step": 1937, + "time_per_iteration": 2.5413918495178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118094, + "balance_loss_mlp": 1.08223581, + "epoch": 0.3728357060407849, + "flos": 542599299072.0, + "grad_norm": 0.08241458585549921, + "language_loss": 0.80921531, + "learning_rate": 0.0007221799133834861, + "loss": 0.82039624, + "num_input_tokens_seen": 161201248, + "router_z_loss_mlp": 0.35839844, + "step": 1938, + "time_per_iteration": 2.620593309402466 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122902, + "balance_loss_mlp": 1.08682895, + "epoch": 0.3730280877260485, + "flos": 433344646656.0, + "grad_norm": 0.05702640818290307, + "language_loss": 0.81373966, + "learning_rate": 0.00072190077559468, + "loss": 0.8249687, + "num_input_tokens_seen": 161266288, + "router_z_loss_mlp": 0.36083984, + "step": 1939, + "time_per_iteration": 2.512871026992798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133095, + "balance_loss_mlp": 1.09587836, + "epoch": 0.37322046941131204, + "flos": 531230068224.0, + "grad_norm": 0.0616329871980105, + "language_loss": 0.89228082, + "learning_rate": 0.0007216215516563527, + "loss": 0.90361178, + "num_input_tokens_seen": 161335648, + "router_z_loss_mlp": 0.37207031, + "step": 1940, + "time_per_iteration": 2.6655144691467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113311, + "balance_loss_mlp": 1.09534478, + "epoch": 0.3734128510965756, + "flos": 531294087168.0, + "grad_norm": 0.05659412158312536, + "language_loss": 0.83479297, + "learning_rate": 0.0007213422416769083, + "loss": 0.84612405, + "num_input_tokens_seen": 161403440, + "router_z_loss_mlp": 0.37744141, + "step": 1941, + "time_per_iteration": 2.6088144779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127109, + "balance_loss_mlp": 1.09022546, + "epoch": 0.37360523278183916, + "flos": 500195284992.0, + "grad_norm": 0.05910558413712496, + "language_loss": 0.74991721, + "learning_rate": 0.0007210628457647849, + "loss": 0.76118833, + "num_input_tokens_seen": 161472864, + "router_z_loss_mlp": 0.36889648, + "step": 1942, + "time_per_iteration": 2.57867169380188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131479, + "balance_loss_mlp": 1.09366596, + "epoch": 0.37379761446710275, + "flos": 547652413440.0, + "grad_norm": 0.06364761456819781, + "language_loss": 0.79148316, + "learning_rate": 0.000720783364028453, + "loss": 0.80279785, + "num_input_tokens_seen": 161548096, + "router_z_loss_mlp": 0.37768555, + "step": 1943, + "time_per_iteration": 2.744575023651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.0834806, + "epoch": 0.3739899961523663, + "flos": 475517316096.0, + "grad_norm": 0.05406366318559307, + "language_loss": 0.87411249, + "learning_rate": 0.0007205037965764177, + "loss": 0.88532799, + "num_input_tokens_seen": 161615600, + "router_z_loss_mlp": 0.38061523, + "step": 1944, + "time_per_iteration": 2.5253238677978516 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122588, + "balance_loss_mlp": 1.0851568, + "epoch": 0.37418237783762986, + "flos": 611626581504.0, + "grad_norm": 0.05571778703090581, + "language_loss": 0.85614675, + "learning_rate": 0.0007202241435172161, + "loss": 0.86737263, + "num_input_tokens_seen": 161687408, + "router_z_loss_mlp": 0.37426758, + "step": 1945, + "time_per_iteration": 2.7462716102600098 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117651, + "balance_loss_mlp": 1.07931328, + "epoch": 0.3743747595228934, + "flos": 765953694720.0, + "grad_norm": 0.05225192391609906, + "language_loss": 0.88148731, + "learning_rate": 0.0007199444049594198, + "loss": 0.89266384, + "num_input_tokens_seen": 161764224, + "router_z_loss_mlp": 0.38330078, + "step": 1946, + "time_per_iteration": 2.9533469676971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116752, + "balance_loss_mlp": 1.07798529, + "epoch": 0.374567141208157, + "flos": 524120993280.0, + "grad_norm": 0.06150523549490838, + "language_loss": 0.83402771, + "learning_rate": 0.0007196645810116322, + "loss": 0.84519523, + "num_input_tokens_seen": 161835520, + "router_z_loss_mlp": 0.38720703, + "step": 1947, + "time_per_iteration": 2.709965705871582 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106001, + "balance_loss_mlp": 1.06735349, + "epoch": 0.37475952289342057, + "flos": 681067090944.0, + "grad_norm": 0.05833074938802531, + "language_loss": 0.83909506, + "learning_rate": 0.0007193846717824912, + "loss": 0.850155, + "num_input_tokens_seen": 161912000, + "router_z_loss_mlp": 0.38598633, + "step": 1948, + "time_per_iteration": 2.854522705078125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094783, + "balance_loss_mlp": 1.05682671, + "epoch": 0.3749519045786841, + "flos": 460061047296.0, + "grad_norm": 0.06673844071937801, + "language_loss": 0.88263041, + "learning_rate": 0.0007191046773806669, + "loss": 0.89357823, + "num_input_tokens_seen": 161977296, + "router_z_loss_mlp": 0.37915039, + "step": 1949, + "time_per_iteration": 2.575989007949829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085738, + "balance_loss_mlp": 1.04682803, + "epoch": 0.3751442862639477, + "flos": 954471458304.0, + "grad_norm": 0.06638817682476543, + "language_loss": 0.83010924, + "learning_rate": 0.0007188245979148631, + "loss": 0.84096658, + "num_input_tokens_seen": 162051888, + "router_z_loss_mlp": 0.38867188, + "step": 1950, + "time_per_iteration": 3.1386518478393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082888, + "balance_loss_mlp": 1.04462171, + "epoch": 0.3753366679492112, + "flos": 527483473920.0, + "grad_norm": 0.05996025340147905, + "language_loss": 0.8766306, + "learning_rate": 0.0007185444334938157, + "loss": 0.88745946, + "num_input_tokens_seen": 162124384, + "router_z_loss_mlp": 0.38232422, + "step": 1951, + "time_per_iteration": 2.644848585128784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082635, + "balance_loss_mlp": 1.04501283, + "epoch": 0.3755290496344748, + "flos": 521535504384.0, + "grad_norm": 0.05938829335869994, + "language_loss": 0.84891546, + "learning_rate": 0.0007182641842262947, + "loss": 0.85974181, + "num_input_tokens_seen": 162191440, + "router_z_loss_mlp": 0.3762207, + "step": 1952, + "time_per_iteration": 2.6239395141601562 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081192, + "balance_loss_mlp": 1.04361689, + "epoch": 0.37572143131973834, + "flos": 620810403840.0, + "grad_norm": 0.06544951097265184, + "language_loss": 0.77827752, + "learning_rate": 0.0007179838502211022, + "loss": 0.78908944, + "num_input_tokens_seen": 162268480, + "router_z_loss_mlp": 0.37524414, + "step": 1953, + "time_per_iteration": 2.8444712162017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076992, + "balance_loss_mlp": 1.03958416, + "epoch": 0.37591381300500193, + "flos": 770635842048.0, + "grad_norm": 0.05616797515781331, + "language_loss": 0.86183697, + "learning_rate": 0.0007177034315870738, + "loss": 0.87260687, + "num_input_tokens_seen": 162346752, + "router_z_loss_mlp": 0.37402344, + "step": 1954, + "time_per_iteration": 2.9628727436065674 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078673, + "balance_loss_mlp": 1.0411936, + "epoch": 0.37610619469026546, + "flos": 520191106560.0, + "grad_norm": 0.05872311076525267, + "language_loss": 0.9098376, + "learning_rate": 0.0007174229284330773, + "loss": 0.92062426, + "num_input_tokens_seen": 162415120, + "router_z_loss_mlp": 0.37402344, + "step": 1955, + "time_per_iteration": 2.579792022705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010784, + "balance_loss_mlp": 1.0412302, + "epoch": 0.37629857637552905, + "flos": 598524456960.0, + "grad_norm": 0.050284285498010506, + "language_loss": 0.86896843, + "learning_rate": 0.0007171423408680141, + "loss": 0.8797524, + "num_input_tokens_seen": 162493280, + "router_z_loss_mlp": 0.37133789, + "step": 1956, + "time_per_iteration": 2.7764384746551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079026, + "balance_loss_mlp": 1.04211903, + "epoch": 0.37649095806079264, + "flos": 564687396864.0, + "grad_norm": 0.058102078307858664, + "language_loss": 0.89614129, + "learning_rate": 0.0007168616690008176, + "loss": 0.90693152, + "num_input_tokens_seen": 162560736, + "router_z_loss_mlp": 0.36889648, + "step": 1957, + "time_per_iteration": 2.646986246109009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083688, + "balance_loss_mlp": 1.04606569, + "epoch": 0.37668333974605617, + "flos": 592196755968.0, + "grad_norm": 0.10223927136981294, + "language_loss": 0.86142451, + "learning_rate": 0.0007165809129404545, + "loss": 0.8722614, + "num_input_tokens_seen": 162630688, + "router_z_loss_mlp": 0.37573242, + "step": 1958, + "time_per_iteration": 2.756287097930908 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081219, + "balance_loss_mlp": 1.04440713, + "epoch": 0.37687572143131975, + "flos": 419257506816.0, + "grad_norm": 0.0560584683493853, + "language_loss": 0.85760534, + "learning_rate": 0.0007163000727959239, + "loss": 0.8684175, + "num_input_tokens_seen": 162694304, + "router_z_loss_mlp": 0.36791992, + "step": 1959, + "time_per_iteration": 2.5415151119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0105237, + "balance_loss_mlp": 1.03587127, + "epoch": 0.3770681031165833, + "flos": 1356484243968.0, + "grad_norm": 0.028402472736143748, + "language_loss": 0.77959073, + "learning_rate": 0.0007160191486762575, + "loss": 0.7901144, + "num_input_tokens_seen": 162920336, + "router_z_loss_mlp": 0.16503906, + "step": 1960, + "time_per_iteration": 4.892657518386841 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086261, + "balance_loss_mlp": 1.04892445, + "epoch": 0.3772604848018469, + "flos": 644592107520.0, + "grad_norm": 0.04530874218926827, + "language_loss": 0.84377986, + "learning_rate": 0.00071573814069052, + "loss": 0.85464251, + "num_input_tokens_seen": 163000720, + "router_z_loss_mlp": 0.37329102, + "step": 1961, + "time_per_iteration": 2.898301839828491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05098641, + "epoch": 0.3774528664871104, + "flos": 901265619456.0, + "grad_norm": 0.052585227845940184, + "language_loss": 0.87987518, + "learning_rate": 0.0007154570489478081, + "loss": 0.89075673, + "num_input_tokens_seen": 163085680, + "router_z_loss_mlp": 0.37158203, + "step": 1962, + "time_per_iteration": 3.1638717651367188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088198, + "balance_loss_mlp": 1.05048013, + "epoch": 0.377645248172374, + "flos": 787717315584.0, + "grad_norm": 0.047624248218528294, + "language_loss": 0.864995, + "learning_rate": 0.0007151758735572514, + "loss": 0.87587702, + "num_input_tokens_seen": 163162224, + "router_z_loss_mlp": 0.37695312, + "step": 1963, + "time_per_iteration": 2.985558271408081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090796, + "balance_loss_mlp": 1.05236292, + "epoch": 0.3778376298576376, + "flos": 586417522176.0, + "grad_norm": 0.06598015027050642, + "language_loss": 0.80448836, + "learning_rate": 0.0007148946146280119, + "loss": 0.81539631, + "num_input_tokens_seen": 163237920, + "router_z_loss_mlp": 0.38427734, + "step": 1964, + "time_per_iteration": 2.784947395324707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01053975, + "balance_loss_mlp": 1.03938425, + "epoch": 0.3780300115429011, + "flos": 1396014759936.0, + "grad_norm": 0.018109037433210438, + "language_loss": 0.72192144, + "learning_rate": 0.000714613272269284, + "loss": 0.73246121, + "num_input_tokens_seen": 163455760, + "router_z_loss_mlp": 0.14550781, + "step": 1965, + "time_per_iteration": 4.85606050491333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052214, + "balance_loss_mlp": 1.03838599, + "epoch": 0.3782223932281647, + "flos": 1356935348736.0, + "grad_norm": 0.019183634636191996, + "language_loss": 0.75341946, + "learning_rate": 0.0007143318465902943, + "loss": 0.76394159, + "num_input_tokens_seen": 163678064, + "router_z_loss_mlp": 0.13867188, + "step": 1966, + "time_per_iteration": 4.946903467178345 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082797, + "balance_loss_mlp": 1.04517484, + "epoch": 0.37841477491342823, + "flos": 703811344896.0, + "grad_norm": 0.05921890511558738, + "language_loss": 0.83782387, + "learning_rate": 0.0007140503377003022, + "loss": 0.84865183, + "num_input_tokens_seen": 163764320, + "router_z_loss_mlp": 0.3762207, + "step": 1967, + "time_per_iteration": 2.984163761138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089165, + "balance_loss_mlp": 1.0504458, + "epoch": 0.3786071565986918, + "flos": 528856985088.0, + "grad_norm": 0.047303083725180994, + "language_loss": 0.84754062, + "learning_rate": 0.000713768745708599, + "loss": 0.85843223, + "num_input_tokens_seen": 163831808, + "router_z_loss_mlp": 0.38696289, + "step": 1968, + "time_per_iteration": 2.6251039505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086627, + "balance_loss_mlp": 1.04881418, + "epoch": 0.37879953828395535, + "flos": 992872802304.0, + "grad_norm": 0.053091209869219315, + "language_loss": 0.76740122, + "learning_rate": 0.0007134870707245085, + "loss": 0.7782675, + "num_input_tokens_seen": 163918128, + "router_z_loss_mlp": 0.37792969, + "step": 1969, + "time_per_iteration": 3.252840995788574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082313, + "balance_loss_mlp": 1.04435682, + "epoch": 0.37899191996921894, + "flos": 626358292992.0, + "grad_norm": 0.06088981396741891, + "language_loss": 0.8454808, + "learning_rate": 0.0007132053128573864, + "loss": 0.85630393, + "num_input_tokens_seen": 163987552, + "router_z_loss_mlp": 0.37915039, + "step": 1970, + "time_per_iteration": 2.739210844039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078955, + "balance_loss_mlp": 1.0417614, + "epoch": 0.37918430165448247, + "flos": 686005314048.0, + "grad_norm": 0.05972304110224919, + "language_loss": 0.83631253, + "learning_rate": 0.0007129234722166211, + "loss": 0.84710205, + "num_input_tokens_seen": 164063248, + "router_z_loss_mlp": 0.37182617, + "step": 1971, + "time_per_iteration": 2.814235210418701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086171, + "balance_loss_mlp": 1.05012178, + "epoch": 0.37937668333974606, + "flos": 475374721536.0, + "grad_norm": 0.05230765101952506, + "language_loss": 0.91063309, + "learning_rate": 0.0007126415489116328, + "loss": 0.92149478, + "num_input_tokens_seen": 164133776, + "router_z_loss_mlp": 0.3605957, + "step": 1972, + "time_per_iteration": 2.657435178756714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091568, + "balance_loss_mlp": 1.05413604, + "epoch": 0.37956906502500964, + "flos": 707271340032.0, + "grad_norm": 0.05210329015751025, + "language_loss": 0.81174934, + "learning_rate": 0.0007123595430518736, + "loss": 0.82266498, + "num_input_tokens_seen": 164206672, + "router_z_loss_mlp": 0.37402344, + "step": 1973, + "time_per_iteration": 2.832801103591919 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108628, + "balance_loss_mlp": 1.04856205, + "epoch": 0.3797614467102732, + "flos": 426421836288.0, + "grad_norm": 0.07403044475037865, + "language_loss": 0.8602494, + "learning_rate": 0.0007120774547468282, + "loss": 0.87111217, + "num_input_tokens_seen": 164271968, + "router_z_loss_mlp": 0.37695312, + "step": 1974, + "time_per_iteration": 2.523059844970703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087165, + "balance_loss_mlp": 1.05016232, + "epoch": 0.37995382839553676, + "flos": 481588941312.0, + "grad_norm": 0.05250431859571283, + "language_loss": 0.81228226, + "learning_rate": 0.0007117952841060128, + "loss": 0.82315391, + "num_input_tokens_seen": 164342800, + "router_z_loss_mlp": 0.36962891, + "step": 1975, + "time_per_iteration": 2.648947238922119 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080927, + "balance_loss_mlp": 1.04409158, + "epoch": 0.3801462100808003, + "flos": 560286056448.0, + "grad_norm": 0.0511194255012935, + "language_loss": 0.83466387, + "learning_rate": 0.0007115130312389756, + "loss": 0.84547317, + "num_input_tokens_seen": 164414928, + "router_z_loss_mlp": 0.3684082, + "step": 1976, + "time_per_iteration": 2.6648154258728027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086722, + "balance_loss_mlp": 1.0505538, + "epoch": 0.3803385917660639, + "flos": 464699524608.0, + "grad_norm": 0.06028169205400359, + "language_loss": 0.79143679, + "learning_rate": 0.0007112306962552973, + "loss": 0.80230403, + "num_input_tokens_seen": 164483312, + "router_z_loss_mlp": 0.36181641, + "step": 1977, + "time_per_iteration": 2.5715434551239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086468, + "balance_loss_mlp": 1.04934657, + "epoch": 0.3805309734513274, + "flos": 521614080000.0, + "grad_norm": 0.055719330197324175, + "language_loss": 0.8517288, + "learning_rate": 0.0007109482792645896, + "loss": 0.86259341, + "num_input_tokens_seen": 164555760, + "router_z_loss_mlp": 0.37084961, + "step": 1978, + "time_per_iteration": 2.6932663917541504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088226, + "balance_loss_mlp": 1.05222487, + "epoch": 0.380723355136591, + "flos": 591128782848.0, + "grad_norm": 0.06665517257748008, + "language_loss": 0.83491528, + "learning_rate": 0.0007106657803764969, + "loss": 0.84579754, + "num_input_tokens_seen": 164626768, + "router_z_loss_mlp": 0.36010742, + "step": 1979, + "time_per_iteration": 2.710658311843872 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079049, + "balance_loss_mlp": 1.04204643, + "epoch": 0.38091573682185453, + "flos": 622394910720.0, + "grad_norm": 0.05735071115872701, + "language_loss": 0.81648314, + "learning_rate": 0.0007103831997006948, + "loss": 0.82727367, + "num_input_tokens_seen": 164698016, + "router_z_loss_mlp": 0.36987305, + "step": 1980, + "time_per_iteration": 2.7589223384857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107835, + "balance_loss_mlp": 1.04168153, + "epoch": 0.3811081185071181, + "flos": 568716208128.0, + "grad_norm": 0.047165366669346453, + "language_loss": 0.85731214, + "learning_rate": 0.0007101005373468908, + "loss": 0.86809564, + "num_input_tokens_seen": 164780320, + "router_z_loss_mlp": 0.36669922, + "step": 1981, + "time_per_iteration": 2.85588002204895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077478, + "balance_loss_mlp": 1.04161954, + "epoch": 0.3813005001923817, + "flos": 584550798336.0, + "grad_norm": 0.055048826019740454, + "language_loss": 0.86394024, + "learning_rate": 0.0007098177934248242, + "loss": 0.87471503, + "num_input_tokens_seen": 164854400, + "router_z_loss_mlp": 0.35888672, + "step": 1982, + "time_per_iteration": 2.7226805686950684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_mlp": 1.04160953, + "epoch": 0.38149288187764524, + "flos": 621287649792.0, + "grad_norm": 0.056689743602043985, + "language_loss": 0.85823661, + "learning_rate": 0.0007095349680442661, + "loss": 0.86901605, + "num_input_tokens_seen": 164932896, + "router_z_loss_mlp": 0.36352539, + "step": 1983, + "time_per_iteration": 2.8454253673553467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075326, + "balance_loss_mlp": 1.03829932, + "epoch": 0.3816852635629088, + "flos": 570414196224.0, + "grad_norm": 0.07971741755252446, + "language_loss": 0.78927159, + "learning_rate": 0.0007092520613150188, + "loss": 0.80002487, + "num_input_tokens_seen": 165002896, + "router_z_loss_mlp": 0.37036133, + "step": 1984, + "time_per_iteration": 2.7279529571533203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081451, + "balance_loss_mlp": 1.04368556, + "epoch": 0.38187764524817236, + "flos": 565313029632.0, + "grad_norm": 0.06238598748372814, + "language_loss": 0.81304747, + "learning_rate": 0.0007089690733469165, + "loss": 0.82386196, + "num_input_tokens_seen": 165074704, + "router_z_loss_mlp": 0.37719727, + "step": 1985, + "time_per_iteration": 2.7343544960021973 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077201, + "balance_loss_mlp": 1.04115212, + "epoch": 0.38207002693343595, + "flos": 630932751360.0, + "grad_norm": 0.07832972313002672, + "language_loss": 0.82561398, + "learning_rate": 0.000708686004249825, + "loss": 0.83638602, + "num_input_tokens_seen": 165149136, + "router_z_loss_mlp": 0.36035156, + "step": 1986, + "time_per_iteration": 2.7691054344177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082228, + "balance_loss_mlp": 1.04572582, + "epoch": 0.3822624086186995, + "flos": 548507980800.0, + "grad_norm": 0.053849318526496194, + "language_loss": 0.9147824, + "learning_rate": 0.0007084028541336413, + "loss": 0.9256047, + "num_input_tokens_seen": 165220864, + "router_z_loss_mlp": 0.36499023, + "step": 1987, + "time_per_iteration": 2.7131056785583496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083994, + "balance_loss_mlp": 1.04753971, + "epoch": 0.38245479030396307, + "flos": 613571880960.0, + "grad_norm": 0.06787860515410171, + "language_loss": 0.86709088, + "learning_rate": 0.0007081196231082942, + "loss": 0.87793082, + "num_input_tokens_seen": 165301568, + "router_z_loss_mlp": 0.36450195, + "step": 1988, + "time_per_iteration": 2.7983548641204834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083168, + "balance_loss_mlp": 1.04621339, + "epoch": 0.38264717198922665, + "flos": 667787466240.0, + "grad_norm": 0.05230973877590939, + "language_loss": 0.80107033, + "learning_rate": 0.0007078363112837436, + "loss": 0.81190205, + "num_input_tokens_seen": 165373152, + "router_z_loss_mlp": 0.36938477, + "step": 1989, + "time_per_iteration": 2.8133270740509033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087048, + "balance_loss_mlp": 1.04935408, + "epoch": 0.3828395536744902, + "flos": 454521922560.0, + "grad_norm": 0.077904410907181, + "language_loss": 0.84988701, + "learning_rate": 0.000707552918769981, + "loss": 0.86075753, + "num_input_tokens_seen": 165439136, + "router_z_loss_mlp": 0.37646484, + "step": 1990, + "time_per_iteration": 2.5100817680358887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089992, + "balance_loss_mlp": 1.05213106, + "epoch": 0.3830319353597538, + "flos": 499191330816.0, + "grad_norm": 0.06242573245055077, + "language_loss": 0.83457661, + "learning_rate": 0.000707269445677029, + "loss": 0.84547657, + "num_input_tokens_seen": 165514624, + "router_z_loss_mlp": 0.37817383, + "step": 1991, + "time_per_iteration": 2.7737526893615723 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099946, + "balance_loss_mlp": 1.06327772, + "epoch": 0.3832243170450173, + "flos": 743787021312.0, + "grad_norm": 0.05437985066129539, + "language_loss": 0.84858984, + "learning_rate": 0.0007069858921149416, + "loss": 0.85958934, + "num_input_tokens_seen": 165594512, + "router_z_loss_mlp": 0.36694336, + "step": 1992, + "time_per_iteration": 2.9642581939697266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095684, + "balance_loss_mlp": 1.05868101, + "epoch": 0.3834166987302809, + "flos": 577937908224.0, + "grad_norm": 0.10762195872615073, + "language_loss": 0.85869837, + "learning_rate": 0.0007067022581938043, + "loss": 0.86965525, + "num_input_tokens_seen": 165673968, + "router_z_loss_mlp": 0.36987305, + "step": 1993, + "time_per_iteration": 2.805201292037964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101221, + "balance_loss_mlp": 1.06531525, + "epoch": 0.3836090804155444, + "flos": 536194432512.0, + "grad_norm": 0.06280477504596697, + "language_loss": 0.831635, + "learning_rate": 0.0007064185440237334, + "loss": 0.84264719, + "num_input_tokens_seen": 165747664, + "router_z_loss_mlp": 0.359375, + "step": 1994, + "time_per_iteration": 2.7297706604003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101916, + "balance_loss_mlp": 1.06527066, + "epoch": 0.383801462100808, + "flos": 601587191808.0, + "grad_norm": 0.05513764490979663, + "language_loss": 0.84278905, + "learning_rate": 0.0007061347497148764, + "loss": 0.85380822, + "num_input_tokens_seen": 165824624, + "router_z_loss_mlp": 0.36621094, + "step": 1995, + "time_per_iteration": 2.725632429122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094031, + "balance_loss_mlp": 1.05876923, + "epoch": 0.38399384378607154, + "flos": 572427896832.0, + "grad_norm": 0.06604776765413087, + "language_loss": 0.86282277, + "learning_rate": 0.0007058508753774122, + "loss": 0.87376308, + "num_input_tokens_seen": 165896304, + "router_z_loss_mlp": 0.35302734, + "step": 1996, + "time_per_iteration": 2.760045051574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092829, + "balance_loss_mlp": 1.05773425, + "epoch": 0.38418622547133513, + "flos": 536513117184.0, + "grad_norm": 0.058737109015633, + "language_loss": 0.86788458, + "learning_rate": 0.0007055669211215505, + "loss": 0.87881291, + "num_input_tokens_seen": 165961312, + "router_z_loss_mlp": 0.35131836, + "step": 1997, + "time_per_iteration": 2.6091408729553223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088772, + "balance_loss_mlp": 1.05238962, + "epoch": 0.3843786071565987, + "flos": 572673798144.0, + "grad_norm": 0.06483433205315106, + "language_loss": 0.77687544, + "learning_rate": 0.0007052828870575322, + "loss": 0.78776312, + "num_input_tokens_seen": 166028064, + "router_z_loss_mlp": 0.36376953, + "step": 1998, + "time_per_iteration": 2.671349048614502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109131, + "balance_loss_mlp": 1.0558331, + "epoch": 0.38457098884186225, + "flos": 728361275904.0, + "grad_norm": 0.05010154832824161, + "language_loss": 0.86955881, + "learning_rate": 0.0007049987732956291, + "loss": 0.88047194, + "num_input_tokens_seen": 166110272, + "router_z_loss_mlp": 0.35498047, + "step": 1999, + "time_per_iteration": 2.9918439388275146 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108718, + "balance_loss_mlp": 1.05189395, + "epoch": 0.38476337052712584, + "flos": 583123442688.0, + "grad_norm": 0.047224279388360366, + "language_loss": 0.82623643, + "learning_rate": 0.0007047145799461439, + "loss": 0.83710825, + "num_input_tokens_seen": 166193088, + "router_z_loss_mlp": 0.35327148, + "step": 2000, + "time_per_iteration": 2.84328293800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092374, + "balance_loss_mlp": 1.05656374, + "epoch": 0.38495575221238937, + "flos": 552787075584.0, + "grad_norm": 0.05254385134155795, + "language_loss": 0.82269979, + "learning_rate": 0.00070443030711941, + "loss": 0.83362353, + "num_input_tokens_seen": 166271776, + "router_z_loss_mlp": 0.3581543, + "step": 2001, + "time_per_iteration": 2.753903865814209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092226, + "balance_loss_mlp": 1.0559628, + "epoch": 0.38514813389765296, + "flos": 654173190144.0, + "grad_norm": 0.05896823149323879, + "language_loss": 0.8241961, + "learning_rate": 0.0007041459549257924, + "loss": 0.83511841, + "num_input_tokens_seen": 166350000, + "router_z_loss_mlp": 0.36254883, + "step": 2002, + "time_per_iteration": 2.85963773727417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_mlp": 1.05030835, + "epoch": 0.3853405155829165, + "flos": 867715158528.0, + "grad_norm": 0.0671523306708724, + "language_loss": 0.78569824, + "learning_rate": 0.0007038615234756859, + "loss": 0.79656541, + "num_input_tokens_seen": 166434336, + "router_z_loss_mlp": 0.36425781, + "step": 2003, + "time_per_iteration": 3.1452226638793945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080594, + "balance_loss_mlp": 1.04440188, + "epoch": 0.3855328972681801, + "flos": 546164011008.0, + "grad_norm": 0.05736478292188374, + "language_loss": 0.83675313, + "learning_rate": 0.000703577012879517, + "loss": 0.84755898, + "num_input_tokens_seen": 166503952, + "router_z_loss_mlp": 0.36230469, + "step": 2004, + "time_per_iteration": 2.6308705806732178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075931, + "balance_loss_mlp": 1.04040706, + "epoch": 0.3857252789534436, + "flos": 533819939328.0, + "grad_norm": 0.0602394573591363, + "language_loss": 0.8843599, + "learning_rate": 0.0007032924232477423, + "loss": 0.89511919, + "num_input_tokens_seen": 166575168, + "router_z_loss_mlp": 0.35595703, + "step": 2005, + "time_per_iteration": 2.6188220977783203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079495, + "balance_loss_mlp": 1.04337406, + "epoch": 0.3859176606387072, + "flos": 491514849792.0, + "grad_norm": 0.055511202055775664, + "language_loss": 0.80448711, + "learning_rate": 0.0007030077546908493, + "loss": 0.81528199, + "num_input_tokens_seen": 166647552, + "router_z_loss_mlp": 0.36132812, + "step": 2006, + "time_per_iteration": 2.6309516429901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088067, + "balance_loss_mlp": 1.07609844, + "epoch": 0.3861100423239708, + "flos": 1486278955008.0, + "grad_norm": 0.032522163132150485, + "language_loss": 0.83064663, + "learning_rate": 0.0007027230073193561, + "loss": 0.84152722, + "num_input_tokens_seen": 166875088, + "router_z_loss_mlp": 0.11962891, + "step": 2007, + "time_per_iteration": 4.736604452133179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083286, + "balance_loss_mlp": 1.04816651, + "epoch": 0.3863024240092343, + "flos": 473493441024.0, + "grad_norm": 0.05514866045126494, + "language_loss": 0.79152983, + "learning_rate": 0.0007024381812438117, + "loss": 0.80236268, + "num_input_tokens_seen": 166939344, + "router_z_loss_mlp": 0.3515625, + "step": 2008, + "time_per_iteration": 2.5392396450042725 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108868, + "balance_loss_mlp": 1.05306053, + "epoch": 0.3864948056944979, + "flos": 716258723328.0, + "grad_norm": 0.059806412844581394, + "language_loss": 0.83199877, + "learning_rate": 0.0007021532765747951, + "loss": 0.84288561, + "num_input_tokens_seen": 167014992, + "router_z_loss_mlp": 0.35668945, + "step": 2009, + "time_per_iteration": 2.9926528930664062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087876, + "balance_loss_mlp": 1.0511353, + "epoch": 0.38668718737976143, + "flos": 727302067200.0, + "grad_norm": 0.05631620148302912, + "language_loss": 0.7933259, + "learning_rate": 0.0007018682934229162, + "loss": 0.8042047, + "num_input_tokens_seen": 167092096, + "router_z_loss_mlp": 0.36743164, + "step": 2010, + "time_per_iteration": 2.924781322479248 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087301, + "balance_loss_mlp": 1.05103779, + "epoch": 0.386879569065025, + "flos": 525218079744.0, + "grad_norm": 0.05794664731816873, + "language_loss": 0.82387936, + "learning_rate": 0.0007015832318988152, + "loss": 0.83475244, + "num_input_tokens_seen": 167162144, + "router_z_loss_mlp": 0.36328125, + "step": 2011, + "time_per_iteration": 2.668565511703491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_mlp": 1.02173615, + "epoch": 0.38707195075028855, + "flos": 1527036005376.0, + "grad_norm": 0.019732384975687786, + "language_loss": 0.73890078, + "learning_rate": 0.000701298092113163, + "loss": 0.74922872, + "num_input_tokens_seen": 167391536, + "router_z_loss_mlp": 0.11083984, + "step": 2012, + "time_per_iteration": 4.948081970214844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085105, + "balance_loss_mlp": 1.04886508, + "epoch": 0.38726433243555214, + "flos": 557045821440.0, + "grad_norm": 0.049164425244227684, + "language_loss": 0.84333575, + "learning_rate": 0.0007010128741766604, + "loss": 0.85418677, + "num_input_tokens_seen": 167466000, + "router_z_loss_mlp": 0.36230469, + "step": 2013, + "time_per_iteration": 2.7263684272766113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073123, + "balance_loss_mlp": 1.03695476, + "epoch": 0.38745671412081567, + "flos": 553431647232.0, + "grad_norm": 0.06787190277242791, + "language_loss": 0.84107876, + "learning_rate": 0.0007007275782000391, + "loss": 0.85181004, + "num_input_tokens_seen": 167536144, + "router_z_loss_mlp": 0.36181641, + "step": 2014, + "time_per_iteration": 2.6458756923675537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082824, + "balance_loss_mlp": 1.04679942, + "epoch": 0.38764909580607926, + "flos": 458175384576.0, + "grad_norm": 0.05583745089019265, + "language_loss": 0.85148585, + "learning_rate": 0.0007004422042940605, + "loss": 0.86231411, + "num_input_tokens_seen": 167600064, + "router_z_loss_mlp": 0.3605957, + "step": 2015, + "time_per_iteration": 2.5374679565429688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074881, + "balance_loss_mlp": 1.03847444, + "epoch": 0.38784147749134285, + "flos": 521973462528.0, + "grad_norm": 0.056017537147394686, + "language_loss": 0.89528251, + "learning_rate": 0.0007001567525695169, + "loss": 0.90603131, + "num_input_tokens_seen": 167666576, + "router_z_loss_mlp": 0.36425781, + "step": 2016, + "time_per_iteration": 2.5863571166992188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081449, + "balance_loss_mlp": 1.04504275, + "epoch": 0.3880338591766064, + "flos": 665696600064.0, + "grad_norm": 0.05583938490392423, + "language_loss": 0.839926, + "learning_rate": 0.0006998712231372303, + "loss": 0.85074055, + "num_input_tokens_seen": 167753296, + "router_z_loss_mlp": 0.36425781, + "step": 2017, + "time_per_iteration": 2.998652219772339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076964, + "balance_loss_mlp": 1.04103458, + "epoch": 0.38822624086186996, + "flos": 593660427264.0, + "grad_norm": 0.044278068469259586, + "language_loss": 0.86088806, + "learning_rate": 0.0006995856161080532, + "loss": 0.87165773, + "num_input_tokens_seen": 167834080, + "router_z_loss_mlp": 0.35961914, + "step": 2018, + "time_per_iteration": 2.870619297027588 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077822, + "balance_loss_mlp": 1.03972268, + "epoch": 0.3884186225471335, + "flos": 612256596480.0, + "grad_norm": 0.10653426783792587, + "language_loss": 0.8221643, + "learning_rate": 0.0006992999315928679, + "loss": 0.83294249, + "num_input_tokens_seen": 167912368, + "router_z_loss_mlp": 0.38061523, + "step": 2019, + "time_per_iteration": 2.7867438793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074258, + "balance_loss_mlp": 1.03782773, + "epoch": 0.3886110042323971, + "flos": 606737820672.0, + "grad_norm": 0.05830260104080337, + "language_loss": 0.85476196, + "learning_rate": 0.0006990141697025871, + "loss": 0.8655045, + "num_input_tokens_seen": 167991968, + "router_z_loss_mlp": 0.36401367, + "step": 2020, + "time_per_iteration": 2.7722346782684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01008633, + "balance_loss_mlp": 0.99642587, + "epoch": 0.3888033859176606, + "flos": 1527289108992.0, + "grad_norm": 0.011259985776032525, + "language_loss": 0.76359642, + "learning_rate": 0.0006987283305481533, + "loss": 0.77368271, + "num_input_tokens_seen": 168212128, + "router_z_loss_mlp": 0.12207031, + "step": 2021, + "time_per_iteration": 4.71975302696228 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01069739, + "balance_loss_mlp": 1.03368998, + "epoch": 0.3889957676029242, + "flos": 692145340416.0, + "grad_norm": 0.08577921290538253, + "language_loss": 0.82040119, + "learning_rate": 0.0006984424142405392, + "loss": 0.83109856, + "num_input_tokens_seen": 168287440, + "router_z_loss_mlp": 0.36035156, + "step": 2022, + "time_per_iteration": 2.8003616333007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070636, + "balance_loss_mlp": 1.03413415, + "epoch": 0.3891881492881878, + "flos": 514937170944.0, + "grad_norm": 0.06357614890279897, + "language_loss": 0.81860286, + "learning_rate": 0.0006981564208907474, + "loss": 0.82930923, + "num_input_tokens_seen": 168354704, + "router_z_loss_mlp": 0.36499023, + "step": 2023, + "time_per_iteration": 2.581556558609009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01074791, + "balance_loss_mlp": 1.03933799, + "epoch": 0.3893805309734513, + "flos": 628770663936.0, + "grad_norm": 0.04985256691663517, + "language_loss": 0.90055227, + "learning_rate": 0.0006978703506098102, + "loss": 0.91130018, + "num_input_tokens_seen": 168424272, + "router_z_loss_mlp": 0.35498047, + "step": 2024, + "time_per_iteration": 2.7220654487609863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080786, + "balance_loss_mlp": 1.04497564, + "epoch": 0.3895729126587149, + "flos": 543894234624.0, + "grad_norm": 0.06500254639996711, + "language_loss": 0.88078821, + "learning_rate": 0.00069758420350879, + "loss": 0.89159608, + "num_input_tokens_seen": 168488912, + "router_z_loss_mlp": 0.35839844, + "step": 2025, + "time_per_iteration": 2.6044023036956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086131, + "balance_loss_mlp": 1.05012965, + "epoch": 0.38976529434397844, + "flos": 617987778048.0, + "grad_norm": 0.06153368317516065, + "language_loss": 0.86008936, + "learning_rate": 0.000697297979698779, + "loss": 0.87095064, + "num_input_tokens_seen": 168563248, + "router_z_loss_mlp": 0.36010742, + "step": 2026, + "time_per_iteration": 2.7051053047180176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091671, + "balance_loss_mlp": 1.05628932, + "epoch": 0.38995767602924203, + "flos": 834518287872.0, + "grad_norm": 0.05732441037152358, + "language_loss": 0.83766049, + "learning_rate": 0.0006970116792908992, + "loss": 0.84857726, + "num_input_tokens_seen": 168648272, + "router_z_loss_mlp": 0.35400391, + "step": 2027, + "time_per_iteration": 3.086228847503662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096114, + "balance_loss_mlp": 1.06032705, + "epoch": 0.39015005771450556, + "flos": 541343651328.0, + "grad_norm": 0.060477391230123065, + "language_loss": 0.8159399, + "learning_rate": 0.000696725302396302, + "loss": 0.82690096, + "num_input_tokens_seen": 168721760, + "router_z_loss_mlp": 0.3581543, + "step": 2028, + "time_per_iteration": 2.6521902084350586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093769, + "balance_loss_mlp": 1.05867422, + "epoch": 0.39034243939976915, + "flos": 1007102536704.0, + "grad_norm": 0.04866281229524116, + "language_loss": 0.85781944, + "learning_rate": 0.0006964388491261692, + "loss": 0.86875713, + "num_input_tokens_seen": 168803664, + "router_z_loss_mlp": 0.35131836, + "step": 2029, + "time_per_iteration": 3.2338647842407227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099777, + "balance_loss_mlp": 1.06401408, + "epoch": 0.3905348210850327, + "flos": 678723121152.0, + "grad_norm": 0.05278281932643199, + "language_loss": 0.87335277, + "learning_rate": 0.0006961523195917114, + "loss": 0.88435054, + "num_input_tokens_seen": 168879184, + "router_z_loss_mlp": 0.35791016, + "step": 2030, + "time_per_iteration": 2.8414504528045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098277, + "balance_loss_mlp": 1.06256151, + "epoch": 0.39072720277029627, + "flos": 548606905344.0, + "grad_norm": 0.05643291477722073, + "language_loss": 0.77850938, + "learning_rate": 0.0006958657139041696, + "loss": 0.78949213, + "num_input_tokens_seen": 168957808, + "router_z_loss_mlp": 0.35742188, + "step": 2031, + "time_per_iteration": 2.7278060913085938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091715, + "balance_loss_mlp": 1.07807708, + "epoch": 0.39091958445555985, + "flos": 1546912401408.0, + "grad_norm": 0.03426807627657635, + "language_loss": 0.76712966, + "learning_rate": 0.0006955790321748136, + "loss": 0.77804685, + "num_input_tokens_seen": 169194416, + "router_z_loss_mlp": 0.13671875, + "step": 2032, + "time_per_iteration": 4.927444696426392 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099209, + "balance_loss_mlp": 1.06399429, + "epoch": 0.3911119661408234, + "flos": 503741058048.0, + "grad_norm": 0.050822615130563034, + "language_loss": 0.78371578, + "learning_rate": 0.0006952922745149434, + "loss": 0.79470789, + "num_input_tokens_seen": 169263552, + "router_z_loss_mlp": 0.35253906, + "step": 2033, + "time_per_iteration": 2.6730306148529053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095244, + "balance_loss_mlp": 1.05933857, + "epoch": 0.391304347826087, + "flos": 556967245824.0, + "grad_norm": 0.05118619150019999, + "language_loss": 0.87770367, + "learning_rate": 0.000695005441035888, + "loss": 0.88865614, + "num_input_tokens_seen": 169333696, + "router_z_loss_mlp": 0.359375, + "step": 2034, + "time_per_iteration": 2.6585283279418945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_mlp": 1.02461255, + "epoch": 0.3914967295113505, + "flos": 1499309858304.0, + "grad_norm": 0.00946210886057752, + "language_loss": 0.73723435, + "learning_rate": 0.0006947185318490064, + "loss": 0.74762058, + "num_input_tokens_seen": 169556416, + "router_z_loss_mlp": 0.140625, + "step": 2035, + "time_per_iteration": 4.8464648723602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087203, + "balance_loss_mlp": 1.05182171, + "epoch": 0.3916891111966141, + "flos": 706715518464.0, + "grad_norm": 0.07007748344060821, + "language_loss": 0.81416976, + "learning_rate": 0.0006944315470656863, + "loss": 0.82504177, + "num_input_tokens_seen": 169643312, + "router_z_loss_mlp": 0.35424805, + "step": 2036, + "time_per_iteration": 2.9289584159851074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010868, + "balance_loss_mlp": 1.05051255, + "epoch": 0.3918814928818776, + "flos": 556085537280.0, + "grad_norm": 0.05570869743183256, + "language_loss": 0.91007531, + "learning_rate": 0.000694144486797345, + "loss": 0.92094326, + "num_input_tokens_seen": 169712560, + "router_z_loss_mlp": 0.36303711, + "step": 2037, + "time_per_iteration": 0.0013861656188964844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01043662, + "balance_loss_mlp": 1.02954793, + "epoch": 0.3920738745671412, + "flos": 1537845032448.0, + "grad_norm": 0.018656318140729232, + "language_loss": 0.79520434, + "learning_rate": 0.0006938573511554296, + "loss": 0.80564094, + "num_input_tokens_seen": 169914912, + "router_z_loss_mlp": 0.14160156, + "step": 2038, + "time_per_iteration": 4.6249003410339355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091153, + "balance_loss_mlp": 1.05586696, + "epoch": 0.39226625625240474, + "flos": 498594811392.0, + "grad_norm": 0.06764177247916761, + "language_loss": 0.89479941, + "learning_rate": 0.0006935701402514156, + "loss": 0.90571094, + "num_input_tokens_seen": 169978848, + "router_z_loss_mlp": 0.35327148, + "step": 2039, + "time_per_iteration": 2.535269260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_mlp": 1.01963174, + "epoch": 0.39245863793766833, + "flos": 1346465203200.0, + "grad_norm": 0.017448285120256823, + "language_loss": 0.73034894, + "learning_rate": 0.0006932828541968083, + "loss": 0.7406826, + "num_input_tokens_seen": 170211488, + "router_z_loss_mlp": 0.13769531, + "step": 2040, + "time_per_iteration": 4.913180589675903 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086444, + "balance_loss_mlp": 1.05072939, + "epoch": 0.3926510196229319, + "flos": 1345614474240.0, + "grad_norm": 0.055350347752650936, + "language_loss": 0.8453002, + "learning_rate": 0.0006929954931031422, + "loss": 0.85616457, + "num_input_tokens_seen": 170298528, + "router_z_loss_mlp": 0.35742188, + "step": 2041, + "time_per_iteration": 3.6796491146087646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081967, + "balance_loss_mlp": 1.04722965, + "epoch": 0.39284340130819545, + "flos": 499333925376.0, + "grad_norm": 0.05434437059814268, + "language_loss": 0.88856936, + "learning_rate": 0.0006927080570819805, + "loss": 0.89938903, + "num_input_tokens_seen": 170365680, + "router_z_loss_mlp": 0.34790039, + "step": 2042, + "time_per_iteration": 2.634052038192749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087578, + "balance_loss_mlp": 1.05217266, + "epoch": 0.39303578299345904, + "flos": 520077625344.0, + "grad_norm": 0.06468620716873735, + "language_loss": 0.80649555, + "learning_rate": 0.0006924205462449161, + "loss": 0.81737131, + "num_input_tokens_seen": 170432224, + "router_z_loss_mlp": 0.35473633, + "step": 2043, + "time_per_iteration": 2.6021780967712402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078315, + "balance_loss_mlp": 1.04288566, + "epoch": 0.39322816467872257, + "flos": 907529301504.0, + "grad_norm": 0.05516365318311268, + "language_loss": 0.82013571, + "learning_rate": 0.0006921329607035702, + "loss": 0.83091891, + "num_input_tokens_seen": 170517920, + "router_z_loss_mlp": 0.35473633, + "step": 2044, + "time_per_iteration": 3.2195992469787598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078665, + "balance_loss_mlp": 1.04473805, + "epoch": 0.39342054636398616, + "flos": 517330603008.0, + "grad_norm": 0.046703626280748714, + "language_loss": 0.88374329, + "learning_rate": 0.0006918453005695938, + "loss": 0.89452994, + "num_input_tokens_seen": 170589072, + "router_z_loss_mlp": 0.33959961, + "step": 2045, + "time_per_iteration": 2.6319282054901123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080714, + "balance_loss_mlp": 1.04435515, + "epoch": 0.3936129280492497, + "flos": 547646621184.0, + "grad_norm": 0.04434497339872072, + "language_loss": 0.8422206, + "learning_rate": 0.0006915575659546662, + "loss": 0.8530277, + "num_input_tokens_seen": 170657856, + "router_z_loss_mlp": 0.36401367, + "step": 2046, + "time_per_iteration": 2.648895263671875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081267, + "balance_loss_mlp": 1.04519427, + "epoch": 0.3938053097345133, + "flos": 525858269184.0, + "grad_norm": 0.0524234289418272, + "language_loss": 0.80648899, + "learning_rate": 0.0006912697569704959, + "loss": 0.81730163, + "num_input_tokens_seen": 170723696, + "router_z_loss_mlp": 0.36083984, + "step": 2047, + "time_per_iteration": 2.6330111026763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085798, + "balance_loss_mlp": 1.04934382, + "epoch": 0.39399769141977686, + "flos": 471390990336.0, + "grad_norm": 0.0542278175669412, + "language_loss": 0.86721706, + "learning_rate": 0.0006909818737288205, + "loss": 0.87807506, + "num_input_tokens_seen": 170789536, + "router_z_loss_mlp": 0.36450195, + "step": 2048, + "time_per_iteration": 2.537581205368042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090734, + "balance_loss_mlp": 1.05468488, + "epoch": 0.3941900731050404, + "flos": 501490220544.0, + "grad_norm": 0.056383256559611315, + "language_loss": 0.80660325, + "learning_rate": 0.000690693916341406, + "loss": 0.8175106, + "num_input_tokens_seen": 170859232, + "router_z_loss_mlp": 0.3605957, + "step": 2049, + "time_per_iteration": 2.622183084487915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096224, + "balance_loss_mlp": 1.05898309, + "epoch": 0.394382454790304, + "flos": 580577241600.0, + "grad_norm": 0.11468284139168772, + "language_loss": 0.82465422, + "learning_rate": 0.0006904058849200475, + "loss": 0.83561641, + "num_input_tokens_seen": 170931568, + "router_z_loss_mlp": 0.37255859, + "step": 2050, + "time_per_iteration": 2.7216436862945557 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087551, + "balance_loss_mlp": 1.05088186, + "epoch": 0.3945748364755675, + "flos": 513563659776.0, + "grad_norm": 0.05187056217607278, + "language_loss": 0.84988606, + "learning_rate": 0.0006901177795765683, + "loss": 0.86076152, + "num_input_tokens_seen": 170999856, + "router_z_loss_mlp": 0.36694336, + "step": 2051, + "time_per_iteration": 2.5725293159484863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079371, + "balance_loss_mlp": 1.04291642, + "epoch": 0.3947672181608311, + "flos": 593683748352.0, + "grad_norm": 0.0518129521666432, + "language_loss": 0.8131091, + "learning_rate": 0.0006898296004228213, + "loss": 0.82390279, + "num_input_tokens_seen": 171072320, + "router_z_loss_mlp": 0.36450195, + "step": 2052, + "time_per_iteration": 2.6969447135925293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_mlp": 1.0379895, + "epoch": 0.39495959984609463, + "flos": 1546829443584.0, + "grad_norm": 0.029989620736742544, + "language_loss": 0.7812674, + "learning_rate": 0.0006895413475706873, + "loss": 0.79177701, + "num_input_tokens_seen": 171304128, + "router_z_loss_mlp": 0.12988281, + "step": 2053, + "time_per_iteration": 4.8486199378967285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078173, + "balance_loss_mlp": 1.04233825, + "epoch": 0.3951519815313582, + "flos": 496271190528.0, + "grad_norm": 0.06693197740080077, + "language_loss": 0.80026031, + "learning_rate": 0.0006892530211320763, + "loss": 0.81104207, + "num_input_tokens_seen": 171377392, + "router_z_loss_mlp": 0.35864258, + "step": 2054, + "time_per_iteration": 2.6731534004211426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082597, + "balance_loss_mlp": 1.04657161, + "epoch": 0.39534436321662175, + "flos": 530934704640.0, + "grad_norm": 0.06400198340094926, + "language_loss": 0.8367995, + "learning_rate": 0.000688964621218926, + "loss": 0.84762549, + "num_input_tokens_seen": 171447424, + "router_z_loss_mlp": 0.36010742, + "step": 2055, + "time_per_iteration": 2.623870611190796 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107818, + "balance_loss_mlp": 1.0422982, + "epoch": 0.39553674490188534, + "flos": 702224017920.0, + "grad_norm": 0.05929287076568038, + "language_loss": 0.80154717, + "learning_rate": 0.0006886761479432037, + "loss": 0.81232893, + "num_input_tokens_seen": 171519920, + "router_z_loss_mlp": 0.35864258, + "step": 2056, + "time_per_iteration": 2.810593366622925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088441, + "balance_loss_mlp": 1.05263042, + "epoch": 0.3957291265871489, + "flos": 409552768512.0, + "grad_norm": 0.05784227470554994, + "language_loss": 0.84645867, + "learning_rate": 0.0006883876014169045, + "loss": 0.85734308, + "num_input_tokens_seen": 171583856, + "router_z_loss_mlp": 0.3581543, + "step": 2057, + "time_per_iteration": 2.464358329772949 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088007, + "balance_loss_mlp": 1.05121863, + "epoch": 0.39592150827241246, + "flos": 618204566016.0, + "grad_norm": 0.05454135250908964, + "language_loss": 0.90161431, + "learning_rate": 0.000688098981752052, + "loss": 0.91249436, + "num_input_tokens_seen": 171656064, + "router_z_loss_mlp": 0.36791992, + "step": 2058, + "time_per_iteration": 2.742589235305786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094817, + "balance_loss_mlp": 1.05819607, + "epoch": 0.39611388995767605, + "flos": 820986969600.0, + "grad_norm": 0.05656267147709111, + "language_loss": 0.80105305, + "learning_rate": 0.0006878102890606982, + "loss": 0.81200117, + "num_input_tokens_seen": 171738800, + "router_z_loss_mlp": 0.36621094, + "step": 2059, + "time_per_iteration": 3.0725343227386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096787, + "balance_loss_mlp": 1.06018949, + "epoch": 0.3963062716429396, + "flos": 491977539072.0, + "grad_norm": 0.0710153527902746, + "language_loss": 0.81321216, + "learning_rate": 0.0006875215234549239, + "loss": 0.82418001, + "num_input_tokens_seen": 171803664, + "router_z_loss_mlp": 0.3659668, + "step": 2060, + "time_per_iteration": 2.542090654373169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097398, + "balance_loss_mlp": 1.06015694, + "epoch": 0.39649865332820317, + "flos": 584466430464.0, + "grad_norm": 0.08966956269211096, + "language_loss": 0.8554219, + "learning_rate": 0.0006872326850468376, + "loss": 0.86639589, + "num_input_tokens_seen": 171871968, + "router_z_loss_mlp": 0.37231445, + "step": 2061, + "time_per_iteration": 2.7194440364837646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010999, + "balance_loss_mlp": 1.06139588, + "epoch": 0.3966910350134667, + "flos": 458328153600.0, + "grad_norm": 0.05276871533818733, + "language_loss": 0.78609985, + "learning_rate": 0.0006869437739485762, + "loss": 0.79709888, + "num_input_tokens_seen": 171942368, + "router_z_loss_mlp": 0.38476562, + "step": 2062, + "time_per_iteration": 2.6032114028930664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089079, + "balance_loss_mlp": 1.05281568, + "epoch": 0.3968834166987303, + "flos": 508388299776.0, + "grad_norm": 0.05735909750828215, + "language_loss": 0.93035084, + "learning_rate": 0.0006866547902723053, + "loss": 0.94124162, + "num_input_tokens_seen": 172012336, + "router_z_loss_mlp": 0.36279297, + "step": 2063, + "time_per_iteration": 2.666294813156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097295, + "balance_loss_mlp": 1.06110287, + "epoch": 0.3970757983839938, + "flos": 572349321216.0, + "grad_norm": 0.05819495660266105, + "language_loss": 0.79961425, + "learning_rate": 0.000686365734130218, + "loss": 0.81058717, + "num_input_tokens_seen": 172084640, + "router_z_loss_mlp": 0.36206055, + "step": 2064, + "time_per_iteration": 2.667443037033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093878, + "balance_loss_mlp": 1.05639839, + "epoch": 0.3972681800692574, + "flos": 481391092224.0, + "grad_norm": 0.051061390118103664, + "language_loss": 0.84029245, + "learning_rate": 0.000686076605634536, + "loss": 0.85123128, + "num_input_tokens_seen": 172152992, + "router_z_loss_mlp": 0.37475586, + "step": 2065, + "time_per_iteration": 2.605252981185913 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091284, + "balance_loss_mlp": 1.05406737, + "epoch": 0.397460561754521, + "flos": 487683887616.0, + "grad_norm": 0.060621892107923396, + "language_loss": 0.84327424, + "learning_rate": 0.0006857874048975088, + "loss": 0.85418713, + "num_input_tokens_seen": 172219312, + "router_z_loss_mlp": 0.37207031, + "step": 2066, + "time_per_iteration": 2.5779786109924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090658, + "balance_loss_mlp": 1.05329823, + "epoch": 0.3976529434397845, + "flos": 421768802304.0, + "grad_norm": 0.05929679602335689, + "language_loss": 0.86975348, + "learning_rate": 0.0006854981320314142, + "loss": 0.88066006, + "num_input_tokens_seen": 172282112, + "router_z_loss_mlp": 0.37353516, + "step": 2067, + "time_per_iteration": 2.4723403453826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082754, + "balance_loss_mlp": 1.04591799, + "epoch": 0.3978453251250481, + "flos": 545331764736.0, + "grad_norm": 0.058605050617464606, + "language_loss": 0.86758339, + "learning_rate": 0.0006852087871485579, + "loss": 0.87841094, + "num_input_tokens_seen": 172347872, + "router_z_loss_mlp": 0.36816406, + "step": 2068, + "time_per_iteration": 2.6007602214813232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082239, + "balance_loss_mlp": 1.04602289, + "epoch": 0.39803770681031164, + "flos": 650548841472.0, + "grad_norm": 0.06821675645960798, + "language_loss": 0.81689966, + "learning_rate": 0.0006849193703612735, + "loss": 0.82772201, + "num_input_tokens_seen": 172418560, + "router_z_loss_mlp": 0.36206055, + "step": 2069, + "time_per_iteration": 2.7661337852478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083569, + "balance_loss_mlp": 1.04661417, + "epoch": 0.39823008849557523, + "flos": 739734888960.0, + "grad_norm": 0.059947122372600754, + "language_loss": 0.77649361, + "learning_rate": 0.0006846298817819225, + "loss": 0.78732932, + "num_input_tokens_seen": 172497984, + "router_z_loss_mlp": 0.36987305, + "step": 2070, + "time_per_iteration": 2.9364843368530273 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091151, + "balance_loss_mlp": 1.05436325, + "epoch": 0.39842247018083876, + "flos": 384825337344.0, + "grad_norm": 0.0736862776590319, + "language_loss": 0.80732799, + "learning_rate": 0.0006843403215228945, + "loss": 0.81823957, + "num_input_tokens_seen": 172560112, + "router_z_loss_mlp": 0.36767578, + "step": 2071, + "time_per_iteration": 2.4597537517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078334, + "balance_loss_mlp": 1.04133189, + "epoch": 0.39861485186610235, + "flos": 533431443456.0, + "grad_norm": 0.052578385162892496, + "language_loss": 0.80366135, + "learning_rate": 0.0006840506896966065, + "loss": 0.81444472, + "num_input_tokens_seen": 172636192, + "router_z_loss_mlp": 0.36962891, + "step": 2072, + "time_per_iteration": 2.6826841831207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081243, + "balance_loss_mlp": 1.04397774, + "epoch": 0.39880723355136594, + "flos": 642834482688.0, + "grad_norm": 0.055481383737447196, + "language_loss": 0.82090193, + "learning_rate": 0.0006837609864155038, + "loss": 0.83171439, + "num_input_tokens_seen": 172715264, + "router_z_loss_mlp": 0.37255859, + "step": 2073, + "time_per_iteration": 2.8541154861450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075844, + "balance_loss_mlp": 1.0408442, + "epoch": 0.39899961523662947, + "flos": 515587534848.0, + "grad_norm": 0.07686004257588779, + "language_loss": 0.83464944, + "learning_rate": 0.0006834712117920592, + "loss": 0.84540784, + "num_input_tokens_seen": 172783456, + "router_z_loss_mlp": 0.3503418, + "step": 2074, + "time_per_iteration": 2.6023800373077393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107731, + "balance_loss_mlp": 1.04025948, + "epoch": 0.39919199692189306, + "flos": 464148085248.0, + "grad_norm": 0.0625246810856132, + "language_loss": 0.85794407, + "learning_rate": 0.0006831813659387729, + "loss": 0.86871719, + "num_input_tokens_seen": 172848928, + "router_z_loss_mlp": 0.37036133, + "step": 2075, + "time_per_iteration": 2.5916242599487305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071695, + "balance_loss_mlp": 1.0353837, + "epoch": 0.3993843786071566, + "flos": 531382837248.0, + "grad_norm": 0.05588277312371317, + "language_loss": 0.84130096, + "learning_rate": 0.0006828914489681733, + "loss": 0.852018, + "num_input_tokens_seen": 172921152, + "router_z_loss_mlp": 0.36303711, + "step": 2076, + "time_per_iteration": 2.7014079093933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078543, + "balance_loss_mlp": 1.04168355, + "epoch": 0.3995767602924202, + "flos": 503701770240.0, + "grad_norm": 0.05616101270921505, + "language_loss": 0.85284638, + "learning_rate": 0.0006826014609928162, + "loss": 0.86363184, + "num_input_tokens_seen": 172998864, + "router_z_loss_mlp": 0.36816406, + "step": 2077, + "time_per_iteration": 2.6714975833892822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089101, + "balance_loss_mlp": 1.07622623, + "epoch": 0.3997691419776837, + "flos": 1453780500480.0, + "grad_norm": 0.03492718818999835, + "language_loss": 0.83199388, + "learning_rate": 0.0006823114021252846, + "loss": 0.8428849, + "num_input_tokens_seen": 173219216, + "router_z_loss_mlp": 0.12890625, + "step": 2078, + "time_per_iteration": 4.8198041915893555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01075675, + "balance_loss_mlp": 1.03943551, + "epoch": 0.3999615236629473, + "flos": 530418170880.0, + "grad_norm": 0.06060184252075253, + "language_loss": 0.79984158, + "learning_rate": 0.0006820212724781896, + "loss": 0.81059831, + "num_input_tokens_seen": 173292000, + "router_z_loss_mlp": 0.36279297, + "step": 2079, + "time_per_iteration": 2.725576400756836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077383, + "balance_loss_mlp": 1.04209709, + "epoch": 0.4001539053482108, + "flos": 694823961600.0, + "grad_norm": 0.12722864956638674, + "language_loss": 0.83843565, + "learning_rate": 0.0006817310721641694, + "loss": 0.84920955, + "num_input_tokens_seen": 173365568, + "router_z_loss_mlp": 0.35302734, + "step": 2080, + "time_per_iteration": 2.808981418609619 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 173365568, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4716113890902016.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}