diff --git "a/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json" "b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json" new file mode 100644--- /dev/null +++ "b/sft_pretrain/Full_smoe_sharev3/checkpoint-1040/trainer_state.json" @@ -0,0 +1,15633 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2000769526741054, + "eval_steps": 500, + "global_step": 1040, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03964023, + "balance_loss_mlp": 3.01339984, + "epoch": 0.00019238168526356292, + "flos": 470464353792.0, + "grad_norm": 27.10233905437441, + "language_loss": 3.72295761, + "learning_rate": 0.0, + "loss": 2.48840261, + "num_input_tokens_seen": 67104, + "router_z_loss_mlp": 9.5, + "step": 1, + "time_per_iteration": 29.606513023376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01906453, + "balance_loss_mlp": 1.25642872, + "epoch": 0.00038476337052712584, + "flos": 504311436288.0, + "grad_norm": 2.874173750989579, + "language_loss": 1.79264998, + "learning_rate": 0.00013726078121135892, + "loss": 1.81171465, + "num_input_tokens_seen": 134080, + "router_z_loss_mlp": 6.5, + "step": 2, + "time_per_iteration": 2.7078208923339844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01915611, + "balance_loss_mlp": 1.2667315, + "epoch": 0.0005771450557906887, + "flos": 598869282816.0, + "grad_norm": 2.1141296462778643, + "language_loss": 1.61429811, + "learning_rate": 0.00021755319103969496, + "loss": 1.63345432, + "num_input_tokens_seen": 205152, + "router_z_loss_mlp": 6.48828125, + "step": 3, + "time_per_iteration": 3.010409116744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01917319, + "balance_loss_mlp": 1.26309848, + "epoch": 0.0007695267410542517, + "flos": 580133491200.0, + "grad_norm": 1.255159247360545, + "language_loss": 1.49202251, + "learning_rate": 0.00027452156242271784, + "loss": 1.51119578, + "num_input_tokens_seen": 269664, + "router_z_loss_mlp": 6.54296875, + "step": 4, + "time_per_iteration": 2.7161622047424316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0185163, + "balance_loss_mlp": 1.22144234, + "epoch": 0.0009619084263178145, + "flos": 485857861632.0, + "grad_norm": 4.267520959606063, + "language_loss": 1.57359505, + "learning_rate": 0.0003187096642208417, + "loss": 1.59211147, + "num_input_tokens_seen": 338560, + "router_z_loss_mlp": 6.296875, + "step": 5, + "time_per_iteration": 2.718417167663574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01828185, + "balance_loss_mlp": 1.21211123, + "epoch": 0.0011542901115813775, + "flos": 559744791552.0, + "grad_norm": 1.225349312557607, + "language_loss": 1.4752574, + "learning_rate": 0.0003548139722510539, + "loss": 1.49353933, + "num_input_tokens_seen": 410112, + "router_z_loss_mlp": 6.15234375, + "step": 6, + "time_per_iteration": 2.6827874183654785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01821666, + "balance_loss_mlp": 1.22428453, + "epoch": 0.0013466717968449403, + "flos": 533721014784.0, + "grad_norm": 0.5025899606895544, + "language_loss": 1.33846116, + "learning_rate": 0.00038533972973918044, + "loss": 1.35667801, + "num_input_tokens_seen": 477552, + "router_z_loss_mlp": 5.96875, + "step": 7, + "time_per_iteration": 2.6889517307281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01776667, + "balance_loss_mlp": 1.2090404, + "epoch": 0.0015390534821085034, + "flos": 492037175808.0, + "grad_norm": 0.1719820928967348, + "language_loss": 1.2814672, + "learning_rate": 0.0004117823436340768, + "loss": 1.29923391, + "num_input_tokens_seen": 549184, + "router_z_loss_mlp": 5.6875, + "step": 8, + "time_per_iteration": 2.7207248210906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0177577, + "balance_loss_mlp": 1.23217535, + "epoch": 0.0017314351673720662, + "flos": 564402207744.0, + "grad_norm": 0.6128716609675008, + "language_loss": 1.39861906, + "learning_rate": 0.00043510638207938993, + "loss": 1.41637683, + "num_input_tokens_seen": 622880, + "router_z_loss_mlp": 5.44140625, + "step": 9, + "time_per_iteration": 2.887538194656372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01823371, + "balance_loss_mlp": 1.31334615, + "epoch": 0.001923816852635629, + "flos": 593132308992.0, + "grad_norm": 0.480897383035181, + "language_loss": 1.25963569, + "learning_rate": 0.00045597044543220066, + "loss": 1.27786922, + "num_input_tokens_seen": 693584, + "router_z_loss_mlp": 5.09765625, + "step": 10, + "time_per_iteration": 2.7672832012176514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01930298, + "balance_loss_mlp": 1.44621277, + "epoch": 0.002116198537899192, + "flos": 609308752896.0, + "grad_norm": 0.21803247425844502, + "language_loss": 1.22959518, + "learning_rate": 0.00047484428652143135, + "loss": 1.24889803, + "num_input_tokens_seen": 774432, + "router_z_loss_mlp": 4.83203125, + "step": 11, + "time_per_iteration": 2.9771082401275635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02130152, + "balance_loss_mlp": 1.67772901, + "epoch": 0.002308580223162755, + "flos": 544869075456.0, + "grad_norm": 0.19847359144835577, + "language_loss": 1.28057694, + "learning_rate": 0.0004920747534624128, + "loss": 1.30187845, + "num_input_tokens_seen": 844304, + "router_z_loss_mlp": 4.52734375, + "step": 12, + "time_per_iteration": 2.6094090938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02177014, + "balance_loss_mlp": 1.7512939, + "epoch": 0.002500961908426318, + "flos": 644458277376.0, + "grad_norm": 0.3126355826019607, + "language_loss": 1.29235363, + "learning_rate": 0.0005079252465375872, + "loss": 1.31412375, + "num_input_tokens_seen": 915104, + "router_z_loss_mlp": 4.265625, + "step": 13, + "time_per_iteration": 2.841792345046997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02141221, + "balance_loss_mlp": 1.74411082, + "epoch": 0.0026933435936898806, + "flos": 487605312000.0, + "grad_norm": 0.282411779716686, + "language_loss": 1.17459798, + "learning_rate": 0.0005226005109505393, + "loss": 1.19601011, + "num_input_tokens_seen": 982720, + "router_z_loss_mlp": 3.96875, + "step": 14, + "time_per_iteration": 2.597313165664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02024541, + "balance_loss_mlp": 1.65890288, + "epoch": 0.0028857252789534437, + "flos": 434368949760.0, + "grad_norm": 0.2583476739022616, + "language_loss": 1.22957516, + "learning_rate": 0.0005362628552605367, + "loss": 1.24982059, + "num_input_tokens_seen": 1050528, + "router_z_loss_mlp": 3.65234375, + "step": 15, + "time_per_iteration": 2.6388704776763916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01790575, + "balance_loss_mlp": 1.44687057, + "epoch": 0.0030781069642170067, + "flos": 596465676288.0, + "grad_norm": 0.18613747071639053, + "language_loss": 1.27631426, + "learning_rate": 0.0005490431248454357, + "loss": 1.29421997, + "num_input_tokens_seen": 1116512, + "router_z_loss_mlp": 3.44140625, + "step": 16, + "time_per_iteration": 2.708346128463745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01779165, + "balance_loss_mlp": 1.46941185, + "epoch": 0.0032704886494805694, + "flos": 1537360432128.0, + "grad_norm": 0.2733785965407311, + "language_loss": 0.75705111, + "learning_rate": 0.0005610483427624225, + "loss": 0.77484274, + "num_input_tokens_seen": 1351216, + "router_z_loss_mlp": 3.09375, + "step": 17, + "time_per_iteration": 6.916250705718994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01553778, + "balance_loss_mlp": 1.24955583, + "epoch": 0.0034628703347441324, + "flos": 473720403456.0, + "grad_norm": 0.11658431553946913, + "language_loss": 1.14468098, + "learning_rate": 0.0005723671632907488, + "loss": 1.16021872, + "num_input_tokens_seen": 1420512, + "router_z_loss_mlp": 3.03710938, + "step": 18, + "time_per_iteration": 2.7716212272644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01490625, + "balance_loss_mlp": 1.21005416, + "epoch": 0.0036552520200076955, + "flos": 448303320576.0, + "grad_norm": 0.11552730485963776, + "language_loss": 1.19723654, + "learning_rate": 0.0005830738490244919, + "loss": 1.21214283, + "num_input_tokens_seen": 1484976, + "router_z_loss_mlp": 2.80859375, + "step": 19, + "time_per_iteration": 2.6067557334899902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0141948, + "balance_loss_mlp": 1.16103387, + "epoch": 0.003847633705271258, + "flos": 635881148928.0, + "grad_norm": 0.11977740619668105, + "language_loss": 1.21676993, + "learning_rate": 0.0005932312266435596, + "loss": 1.23096466, + "num_input_tokens_seen": 1557392, + "router_z_loss_mlp": 2.58398438, + "step": 20, + "time_per_iteration": 2.8545703887939453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01364308, + "balance_loss_mlp": 1.13084817, + "epoch": 0.004040015390534821, + "flos": 589222771200.0, + "grad_norm": 0.09935322828728523, + "language_loss": 1.16681409, + "learning_rate": 0.0006028929207788754, + "loss": 1.18045723, + "num_input_tokens_seen": 1626064, + "router_z_loss_mlp": 2.33203125, + "step": 21, + "time_per_iteration": 2.7119524478912354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319718, + "balance_loss_mlp": 1.11038613, + "epoch": 0.004232397075798384, + "flos": 756253338624.0, + "grad_norm": 0.09023283304690737, + "language_loss": 1.20250762, + "learning_rate": 0.0006121050677327902, + "loss": 1.21570492, + "num_input_tokens_seen": 1696528, + "router_z_loss_mlp": 2.09667969, + "step": 22, + "time_per_iteration": 2.884739398956299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01304467, + "balance_loss_mlp": 1.1184051, + "epoch": 0.004424778761061947, + "flos": 526434439680.0, + "grad_norm": 0.08559602389751407, + "language_loss": 1.10067201, + "learning_rate": 0.0006209076479463684, + "loss": 1.1137166, + "num_input_tokens_seen": 1765936, + "router_z_loss_mlp": 1.85839844, + "step": 23, + "time_per_iteration": 2.6616718769073486 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275434, + "balance_loss_mlp": 1.10787356, + "epoch": 0.00461716044632551, + "flos": 547907079168.0, + "grad_norm": 0.07141137445072718, + "language_loss": 1.2012924, + "learning_rate": 0.0006293355346737718, + "loss": 1.21404672, + "num_input_tokens_seen": 1841632, + "router_z_loss_mlp": 1.67675781, + "step": 24, + "time_per_iteration": 2.7025952339172363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01252583, + "balance_loss_mlp": 1.10476315, + "epoch": 0.004809542131589073, + "flos": 567293234688.0, + "grad_norm": 0.08524381015789384, + "language_loss": 1.16738653, + "learning_rate": 0.0006374193284416834, + "loss": 1.17991233, + "num_input_tokens_seen": 1920256, + "router_z_loss_mlp": 1.47753906, + "step": 25, + "time_per_iteration": 2.827439069747925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223638, + "balance_loss_mlp": 1.0984205, + "epoch": 0.005001923816852636, + "flos": 470391418368.0, + "grad_norm": 0.08512374611478205, + "language_loss": 1.15399337, + "learning_rate": 0.0006451860277489461, + "loss": 1.16622972, + "num_input_tokens_seen": 1986528, + "router_z_loss_mlp": 1.25097656, + "step": 26, + "time_per_iteration": 2.6214864253997803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206141, + "balance_loss_mlp": 1.10009253, + "epoch": 0.005194305502116198, + "flos": 415283950080.0, + "grad_norm": 0.07774032731783902, + "language_loss": 1.23061514, + "learning_rate": 0.0006526595731190848, + "loss": 1.2426765, + "num_input_tokens_seen": 2048016, + "router_z_loss_mlp": 1.0625, + "step": 27, + "time_per_iteration": 2.5637125968933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117904, + "balance_loss_mlp": 1.09192181, + "epoch": 0.005386687187379761, + "flos": 628466535936.0, + "grad_norm": 0.05524077436438855, + "language_loss": 1.1626848, + "learning_rate": 0.0006598612921618983, + "loss": 1.17447519, + "num_input_tokens_seen": 2127664, + "router_z_loss_mlp": 0.87158203, + "step": 28, + "time_per_iteration": 2.8202784061431885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159441, + "balance_loss_mlp": 1.08772469, + "epoch": 0.005579068872643324, + "flos": 886100332032.0, + "grad_norm": 0.07386109802626846, + "language_loss": 1.08505416, + "learning_rate": 0.0006668102665011454, + "loss": 1.09664845, + "num_input_tokens_seen": 2213952, + "router_z_loss_mlp": 0.71728516, + "step": 29, + "time_per_iteration": 3.2254040241241455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154142, + "balance_loss_mlp": 1.09520459, + "epoch": 0.005771450557906887, + "flos": 547287238656.0, + "grad_norm": 0.0797557646441396, + "language_loss": 1.18077409, + "learning_rate": 0.0006735236364718957, + "loss": 1.19231534, + "num_input_tokens_seen": 2284736, + "router_z_loss_mlp": 0.58886719, + "step": 30, + "time_per_iteration": 2.6730945110321045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140737, + "balance_loss_mlp": 1.09384, + "epoch": 0.00596383224317045, + "flos": 531766950912.0, + "grad_norm": 0.060827451674393726, + "language_loss": 1.1687839, + "learning_rate": 0.0006800168558381346, + "loss": 1.18019128, + "num_input_tokens_seen": 2354384, + "router_z_loss_mlp": 0.46875, + "step": 31, + "time_per_iteration": 2.649216651916504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148736, + "balance_loss_mlp": 1.11166239, + "epoch": 0.0061562139284340135, + "flos": 588813926400.0, + "grad_norm": 0.10592463777190406, + "language_loss": 1.19211543, + "learning_rate": 0.0006863039060567947, + "loss": 1.20360279, + "num_input_tokens_seen": 2419440, + "router_z_loss_mlp": 0.37084961, + "step": 32, + "time_per_iteration": 2.6697018146514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132499, + "balance_loss_mlp": 1.10136151, + "epoch": 0.006348595613697576, + "flos": 617929551360.0, + "grad_norm": 0.09812744917576391, + "language_loss": 1.1217525, + "learning_rate": 0.0006923974775611263, + "loss": 1.13307738, + "num_input_tokens_seen": 2496368, + "router_z_loss_mlp": 0.3112793, + "step": 33, + "time_per_iteration": 2.770225763320923 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137532, + "balance_loss_mlp": 1.11146092, + "epoch": 0.006540977298961139, + "flos": 777564444672.0, + "grad_norm": 0.06513543096417564, + "language_loss": 1.08375585, + "learning_rate": 0.0006983091239737814, + "loss": 1.09513116, + "num_input_tokens_seen": 2573280, + "router_z_loss_mlp": 0.26086426, + "step": 34, + "time_per_iteration": 2.99418306350708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128276, + "balance_loss_mlp": 1.10578084, + "epoch": 0.006733358984224702, + "flos": 666837356544.0, + "grad_norm": 0.06344935516817307, + "language_loss": 1.07062221, + "learning_rate": 0.0007040493939600222, + "loss": 1.08190489, + "num_input_tokens_seen": 2647248, + "router_z_loss_mlp": 0.22497559, + "step": 35, + "time_per_iteration": 2.9126057624816895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119708, + "balance_loss_mlp": 1.09892988, + "epoch": 0.006925740669488265, + "flos": 564092287488.0, + "grad_norm": 0.06579143759664555, + "language_loss": 1.07960629, + "learning_rate": 0.0007096279445021078, + "loss": 1.09080338, + "num_input_tokens_seen": 2720736, + "router_z_loss_mlp": 0.20788574, + "step": 36, + "time_per_iteration": 2.7079102993011475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114855, + "balance_loss_mlp": 1.09574544, + "epoch": 0.007118122354751828, + "flos": 549583156224.0, + "grad_norm": 0.14799474820221378, + "language_loss": 1.14634764, + "learning_rate": 0.0007150536386503726, + "loss": 1.15749621, + "num_input_tokens_seen": 2800336, + "router_z_loss_mlp": 0.19104004, + "step": 37, + "time_per_iteration": 2.8290467262268066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104011, + "balance_loss_mlp": 1.08533084, + "epoch": 0.007310504040015391, + "flos": 702161409024.0, + "grad_norm": 0.2513092385422617, + "language_loss": 1.08396375, + "learning_rate": 0.0007203346302358509, + "loss": 1.09500384, + "num_input_tokens_seen": 2883184, + "router_z_loss_mlp": 0.18688965, + "step": 38, + "time_per_iteration": 2.961430311203003 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121274, + "balance_loss_mlp": 1.10231924, + "epoch": 0.007502885725278953, + "flos": 599022051840.0, + "grad_norm": 0.0999674626629785, + "language_loss": 1.11391926, + "learning_rate": 0.000725478437577282, + "loss": 1.12513208, + "num_input_tokens_seen": 2960736, + "router_z_loss_mlp": 0.18945312, + "step": 39, + "time_per_iteration": 2.742088556289673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146989, + "balance_loss_mlp": 1.12810588, + "epoch": 0.007695267410542516, + "flos": 560000867328.0, + "grad_norm": 0.3323772184023467, + "language_loss": 1.08355689, + "learning_rate": 0.0007304920078549186, + "loss": 1.09502685, + "num_input_tokens_seen": 3033472, + "router_z_loss_mlp": 0.18884277, + "step": 40, + "time_per_iteration": 2.66943621635437 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116486, + "balance_loss_mlp": 1.1452378, + "epoch": 0.007887649095806078, + "flos": 507906671616.0, + "grad_norm": 0.11539272036457353, + "language_loss": 1.09356606, + "learning_rate": 0.0007353817735343603, + "loss": 1.1052146, + "num_input_tokens_seen": 3107824, + "router_z_loss_mlp": 0.19604492, + "step": 41, + "time_per_iteration": 2.7052595615386963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132998, + "balance_loss_mlp": 1.11293542, + "epoch": 0.008080030781069641, + "flos": 503642133504.0, + "grad_norm": 0.12251683576194117, + "language_loss": 1.04851842, + "learning_rate": 0.0007401537019902344, + "loss": 1.05984843, + "num_input_tokens_seen": 3176528, + "router_z_loss_mlp": 0.20056152, + "step": 42, + "time_per_iteration": 2.590432643890381 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124507, + "balance_loss_mlp": 1.10198867, + "epoch": 0.008272412466333205, + "flos": 517764178944.0, + "grad_norm": 0.09393858903586973, + "language_loss": 1.08539796, + "learning_rate": 0.0007448133392900729, + "loss": 1.09664297, + "num_input_tokens_seen": 3254256, + "router_z_loss_mlp": 0.22521973, + "step": 43, + "time_per_iteration": 2.6619081497192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112544, + "balance_loss_mlp": 1.10156202, + "epoch": 0.008464794151596768, + "flos": 607673373696.0, + "grad_norm": 0.06822323064374927, + "language_loss": 1.03845203, + "learning_rate": 0.0007493658489441491, + "loss": 1.04970646, + "num_input_tokens_seen": 3340224, + "router_z_loss_mlp": 0.23864746, + "step": 44, + "time_per_iteration": 2.861008644104004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128905, + "balance_loss_mlp": 1.10477662, + "epoch": 0.00865717583686033, + "flos": 537661075968.0, + "grad_norm": 0.1413166066405165, + "language_loss": 1.08820629, + "learning_rate": 0.0007538160463002316, + "loss": 1.09949529, + "num_input_tokens_seen": 3409216, + "router_z_loss_mlp": 0.2409668, + "step": 45, + "time_per_iteration": 2.643458604812622 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115676, + "balance_loss_mlp": 1.13258433, + "epoch": 0.008849557522123894, + "flos": 507758284800.0, + "grad_norm": 0.08570115972640321, + "language_loss": 1.10720444, + "learning_rate": 0.0007581684291577274, + "loss": 1.11877203, + "num_input_tokens_seen": 3478352, + "router_z_loss_mlp": 0.24157715, + "step": 46, + "time_per_iteration": 2.5904788970947266 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145761, + "balance_loss_mlp": 1.12085772, + "epoch": 0.009041939207387457, + "flos": 625048800768.0, + "grad_norm": 0.06636849455276843, + "language_loss": 1.14156199, + "learning_rate": 0.0007624272050891776, + "loss": 1.15301955, + "num_input_tokens_seen": 3555616, + "router_z_loss_mlp": 0.24902344, + "step": 47, + "time_per_iteration": 2.782179594039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154374, + "balance_loss_mlp": 1.12759995, + "epoch": 0.00923432089265102, + "flos": 549124849152.0, + "grad_norm": 0.09356522507451794, + "language_loss": 1.04615343, + "learning_rate": 0.0007665963158851307, + "loss": 1.05769718, + "num_input_tokens_seen": 3634512, + "router_z_loss_mlp": 0.26806641, + "step": 48, + "time_per_iteration": 2.824540138244629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174661, + "balance_loss_mlp": 1.14738548, + "epoch": 0.009426702577914583, + "flos": 562202242560.0, + "grad_norm": 0.059100241584136314, + "language_loss": 1.12381458, + "learning_rate": 0.0007706794594783609, + "loss": 1.13556111, + "num_input_tokens_seen": 3708480, + "router_z_loss_mlp": 0.27270508, + "step": 49, + "time_per_iteration": 2.790757894515991 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192673, + "balance_loss_mlp": 1.16604137, + "epoch": 0.009619084263178146, + "flos": 616486228992.0, + "grad_norm": 0.08074806779925832, + "language_loss": 1.11280799, + "learning_rate": 0.0007746801096530423, + "loss": 1.12473488, + "num_input_tokens_seen": 3783472, + "router_z_loss_mlp": 0.2668457, + "step": 50, + "time_per_iteration": 2.7235305309295654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116178, + "balance_loss_mlp": 1.135149, + "epoch": 0.009811465948441709, + "flos": 541176325632.0, + "grad_norm": 0.06558886342971224, + "language_loss": 1.16576111, + "learning_rate": 0.0007786015338021173, + "loss": 1.17737889, + "num_input_tokens_seen": 3851360, + "router_z_loss_mlp": 0.26672363, + "step": 51, + "time_per_iteration": 2.6817519664764404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134628, + "balance_loss_mlp": 1.1085453, + "epoch": 0.010003847633705272, + "flos": 535608087552.0, + "grad_norm": 0.06210449580458492, + "language_loss": 1.08870959, + "learning_rate": 0.0007824468089603051, + "loss": 1.10005593, + "num_input_tokens_seen": 3923056, + "router_z_loss_mlp": 0.26098633, + "step": 52, + "time_per_iteration": 2.644577980041504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125522, + "balance_loss_mlp": 1.09910512, + "epoch": 0.010196229318968833, + "flos": 908867907072.0, + "grad_norm": 0.05864822926220488, + "language_loss": 1.07807887, + "learning_rate": 0.0007862188363098669, + "loss": 1.08933413, + "num_input_tokens_seen": 4004528, + "router_z_loss_mlp": 0.26428223, + "step": 53, + "time_per_iteration": 3.1450047492980957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126237, + "balance_loss_mlp": 1.10084558, + "epoch": 0.010388611004232396, + "flos": 585594040320.0, + "grad_norm": 0.07974065634267835, + "language_loss": 1.08295977, + "learning_rate": 0.0007899203543304438, + "loss": 1.09422219, + "num_input_tokens_seen": 4078704, + "router_z_loss_mlp": 0.25390625, + "step": 54, + "time_per_iteration": 2.6822280883789062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155972, + "balance_loss_mlp": 1.13315582, + "epoch": 0.01058099268949596, + "flos": 502233716736.0, + "grad_norm": 0.07014139109577967, + "language_loss": 1.22212756, + "learning_rate": 0.0007935539507422731, + "loss": 1.23368728, + "num_input_tokens_seen": 4143600, + "router_z_loss_mlp": 0.22814941, + "step": 55, + "time_per_iteration": 2.5841405391693115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0117516, + "balance_loss_mlp": 1.153512, + "epoch": 0.010773374374759523, + "flos": 544170659328.0, + "grad_norm": 0.07006342440897594, + "language_loss": 1.13914931, + "learning_rate": 0.0007971220733732573, + "loss": 1.15090084, + "num_input_tokens_seen": 4217904, + "router_z_loss_mlp": 0.21643066, + "step": 56, + "time_per_iteration": 2.697427988052368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01193099, + "balance_loss_mlp": 1.17267895, + "epoch": 0.010965756060023086, + "flos": 525874235904.0, + "grad_norm": 0.08125896119424647, + "language_loss": 1.0764755, + "learning_rate": 0.0008006270400641869, + "loss": 1.08840656, + "num_input_tokens_seen": 4293920, + "router_z_loss_mlp": 0.2043457, + "step": 57, + "time_per_iteration": 2.723154306411743 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01174019, + "balance_loss_mlp": 1.15412247, + "epoch": 0.011158137745286649, + "flos": 576653147136.0, + "grad_norm": 0.07485866075688756, + "language_loss": 1.09104013, + "learning_rate": 0.0008040710477125043, + "loss": 1.10278034, + "num_input_tokens_seen": 4370080, + "router_z_loss_mlp": 0.19897461, + "step": 58, + "time_per_iteration": 2.703186273574829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153983, + "balance_loss_mlp": 1.13440859, + "epoch": 0.011350519430550212, + "flos": 529024310784.0, + "grad_norm": 0.06764829366941465, + "language_loss": 1.09780312, + "learning_rate": 0.0008074561805429771, + "loss": 1.10934305, + "num_input_tokens_seen": 4439792, + "router_z_loss_mlp": 0.19567871, + "step": 59, + "time_per_iteration": 2.6111674308776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136624, + "balance_loss_mlp": 1.11676335, + "epoch": 0.011542901115813775, + "flos": 555608291328.0, + "grad_norm": 0.06986870516034673, + "language_loss": 1.08079648, + "learning_rate": 0.0008107844176832545, + "loss": 1.09216261, + "num_input_tokens_seen": 4510800, + "router_z_loss_mlp": 0.19848633, + "step": 60, + "time_per_iteration": 2.682687997817993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125651, + "balance_loss_mlp": 1.1056236, + "epoch": 0.011735282801077338, + "flos": 571826995200.0, + "grad_norm": 0.061548073586970495, + "language_loss": 1.09071934, + "learning_rate": 0.0008140576401132568, + "loss": 1.10197592, + "num_input_tokens_seen": 4581136, + "router_z_loss_mlp": 0.20019531, + "step": 61, + "time_per_iteration": 2.639394760131836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111743, + "balance_loss_mlp": 1.09838021, + "epoch": 0.0119276644863409, + "flos": 615309156864.0, + "grad_norm": 0.06273761556608791, + "language_loss": 1.10558033, + "learning_rate": 0.0008172776370494935, + "loss": 1.11675453, + "num_input_tokens_seen": 4650352, + "router_z_loss_mlp": 0.19030762, + "step": 62, + "time_per_iteration": 2.7110230922698975 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134294, + "balance_loss_mlp": 1.11483955, + "epoch": 0.012120046171604464, + "flos": 500835474432.0, + "grad_norm": 0.07391589684249159, + "language_loss": 1.17346644, + "learning_rate": 0.0008204461118185703, + "loss": 1.18480933, + "num_input_tokens_seen": 4716336, + "router_z_loss_mlp": 0.19445801, + "step": 63, + "time_per_iteration": 2.5490689277648926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142708, + "balance_loss_mlp": 1.12420678, + "epoch": 0.012312427856868027, + "flos": 473109327360.0, + "grad_norm": 0.05825974543220343, + "language_loss": 1.06081367, + "learning_rate": 0.0008235646872681536, + "loss": 1.07224083, + "num_input_tokens_seen": 4781648, + "router_z_loss_mlp": 0.18505859, + "step": 64, + "time_per_iteration": 2.5874247550964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139504, + "balance_loss_mlp": 1.12069249, + "epoch": 0.012504809542131588, + "flos": 538094651904.0, + "grad_norm": 0.1040066778051144, + "language_loss": 1.06503749, + "learning_rate": 0.0008266349107584288, + "loss": 1.07643247, + "num_input_tokens_seen": 4852320, + "router_z_loss_mlp": 0.18823242, + "step": 65, + "time_per_iteration": 2.678736925125122 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123492, + "balance_loss_mlp": 1.10500288, + "epoch": 0.012697191227395151, + "flos": 608450365440.0, + "grad_norm": 0.09066354406474254, + "language_loss": 1.09410381, + "learning_rate": 0.0008296582587724851, + "loss": 1.10533869, + "num_input_tokens_seen": 4922016, + "router_z_loss_mlp": 0.18481445, + "step": 66, + "time_per_iteration": 2.6937255859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121105, + "balance_loss_mlp": 1.10255599, + "epoch": 0.012889572912658714, + "flos": 767750607360.0, + "grad_norm": 0.11790618145169461, + "language_loss": 1.07982886, + "learning_rate": 0.0008326361411800136, + "loss": 1.0910399, + "num_input_tokens_seen": 5000128, + "router_z_loss_mlp": 0.1854248, + "step": 67, + "time_per_iteration": 2.9377663135528564 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096346, + "balance_loss_mlp": 1.07871521, + "epoch": 0.013081954597922277, + "flos": 533604561408.0, + "grad_norm": 0.09153807632987658, + "language_loss": 1.08335972, + "learning_rate": 0.0008355699051851403, + "loss": 1.09432316, + "num_input_tokens_seen": 5074512, + "router_z_loss_mlp": 0.17651367, + "step": 68, + "time_per_iteration": 2.7278473377227783 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104023, + "balance_loss_mlp": 1.0865227, + "epoch": 0.01327433628318584, + "flos": 572826567168.0, + "grad_norm": 0.08317322449907456, + "language_loss": 1.14837921, + "learning_rate": 0.0008384608389860635, + "loss": 1.15941942, + "num_input_tokens_seen": 5141856, + "router_z_loss_mlp": 0.1751709, + "step": 69, + "time_per_iteration": 2.7238211631774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111418, + "balance_loss_mlp": 1.09424019, + "epoch": 0.013466717968449404, + "flos": 497029243392.0, + "grad_norm": 0.08213812906773327, + "language_loss": 1.04970825, + "learning_rate": 0.000841310175171381, + "loss": 1.06082237, + "num_input_tokens_seen": 5209280, + "router_z_loss_mlp": 0.17199707, + "step": 70, + "time_per_iteration": 2.578726291656494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101987, + "balance_loss_mlp": 1.08526158, + "epoch": 0.013659099653712967, + "flos": 565234454016.0, + "grad_norm": 0.06358988870017376, + "language_loss": 1.03380442, + "learning_rate": 0.000844119093875517, + "loss": 1.04482436, + "num_input_tokens_seen": 5285424, + "router_z_loss_mlp": 0.16723633, + "step": 71, + "time_per_iteration": 2.692791223526001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103533, + "balance_loss_mlp": 1.08689094, + "epoch": 0.01385148133897653, + "flos": 573540950016.0, + "grad_norm": 0.07461407963015444, + "language_loss": 1.08098376, + "learning_rate": 0.0008468887257134666, + "loss": 1.09201908, + "num_input_tokens_seen": 5358624, + "router_z_loss_mlp": 0.16650391, + "step": 72, + "time_per_iteration": 2.6599459648132324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122889, + "balance_loss_mlp": 1.10587776, + "epoch": 0.014043863024240093, + "flos": 576539665920.0, + "grad_norm": 0.05931650266846123, + "language_loss": 1.10316896, + "learning_rate": 0.0008496201545131264, + "loss": 1.11439776, + "num_input_tokens_seen": 5429792, + "router_z_loss_mlp": 0.17028809, + "step": 73, + "time_per_iteration": 2.7093684673309326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126213, + "balance_loss_mlp": 1.10950017, + "epoch": 0.014236244709503656, + "flos": 938287660032.0, + "grad_norm": 0.060718352480344094, + "language_loss": 1.08902812, + "learning_rate": 0.0008523144198617317, + "loss": 1.1002903, + "num_input_tokens_seen": 5518608, + "router_z_loss_mlp": 0.16711426, + "step": 74, + "time_per_iteration": 3.1743276119232178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125614, + "balance_loss_mlp": 1.10876918, + "epoch": 0.014428626394767219, + "flos": 528231352320.0, + "grad_norm": 0.07198154728214846, + "language_loss": 1.08249164, + "learning_rate": 0.0008549725194813783, + "loss": 1.09374774, + "num_input_tokens_seen": 5590576, + "router_z_loss_mlp": 0.1685791, + "step": 75, + "time_per_iteration": 2.630387783050537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106727, + "balance_loss_mlp": 1.09047866, + "epoch": 0.014621008080030782, + "flos": 803371433472.0, + "grad_norm": 0.07553700512989577, + "language_loss": 1.06998253, + "learning_rate": 0.0008575954114472099, + "loss": 1.0810498, + "num_input_tokens_seen": 5674224, + "router_z_loss_mlp": 0.16247559, + "step": 76, + "time_per_iteration": 3.134385347366333 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109532, + "balance_loss_mlp": 1.0933075, + "epoch": 0.014813389765294343, + "flos": 696588788736.0, + "grad_norm": 0.053440596513601155, + "language_loss": 1.05069363, + "learning_rate": 0.0008601840162606118, + "loss": 1.06178904, + "num_input_tokens_seen": 5757648, + "router_z_loss_mlp": 0.16223145, + "step": 77, + "time_per_iteration": 3.039991855621338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123171, + "balance_loss_mlp": 1.10660076, + "epoch": 0.015005771450557906, + "flos": 596702813184.0, + "grad_norm": 0.07894951514499118, + "language_loss": 1.1143651, + "learning_rate": 0.000862739218788641, + "loss": 1.12559676, + "num_input_tokens_seen": 5837600, + "router_z_loss_mlp": 0.16577148, + "step": 78, + "time_per_iteration": 2.867741346359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113228, + "balance_loss_mlp": 1.11553121, + "epoch": 0.01519815313582147, + "flos": 549148170240.0, + "grad_norm": 0.0893413961860561, + "language_loss": 1.07743871, + "learning_rate": 0.0008652618700799138, + "loss": 1.08876157, + "num_input_tokens_seen": 5907248, + "router_z_loss_mlp": 0.16760254, + "step": 79, + "time_per_iteration": 2.675795555114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01160152, + "balance_loss_mlp": 1.14348662, + "epoch": 0.015390534821085032, + "flos": 430306642944.0, + "grad_norm": 0.06679936706529424, + "language_loss": 1.07125092, + "learning_rate": 0.0008677527890662774, + "loss": 1.08285248, + "num_input_tokens_seen": 5970864, + "router_z_loss_mlp": 0.16662598, + "step": 80, + "time_per_iteration": 2.4765963554382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196819, + "balance_loss_mlp": 1.17889023, + "epoch": 0.015582916506348595, + "flos": 523854743040.0, + "grad_norm": 0.12362960542988827, + "language_loss": 1.09903598, + "learning_rate": 0.0008702127641587799, + "loss": 1.11100423, + "num_input_tokens_seen": 6040800, + "router_z_loss_mlp": 0.17932129, + "step": 81, + "time_per_iteration": 2.636688470840454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180455, + "balance_loss_mlp": 1.16288388, + "epoch": 0.015775298191612157, + "flos": 575151598080.0, + "grad_norm": 0.08274533442322421, + "language_loss": 1.04032063, + "learning_rate": 0.0008726425547457192, + "loss": 1.05212522, + "num_input_tokens_seen": 6111840, + "router_z_loss_mlp": 0.17565918, + "step": 82, + "time_per_iteration": 2.765179395675659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157804, + "balance_loss_mlp": 1.14051914, + "epoch": 0.01596767987687572, + "flos": 610040664576.0, + "grad_norm": 0.07618339381967684, + "language_loss": 1.03921247, + "learning_rate": 0.0008750428925998964, + "loss": 1.05079055, + "num_input_tokens_seen": 6183872, + "router_z_loss_mlp": 0.1730957, + "step": 83, + "time_per_iteration": 2.7615418434143066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159673, + "balance_loss_mlp": 1.14280462, + "epoch": 0.016160061562139283, + "flos": 566864040960.0, + "grad_norm": 0.0706757922791228, + "language_loss": 1.09743476, + "learning_rate": 0.0008774144832015932, + "loss": 1.10903156, + "num_input_tokens_seen": 6255760, + "router_z_loss_mlp": 0.16882324, + "step": 84, + "time_per_iteration": 2.694364070892334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01699218, + "balance_loss_mlp": 1.68252861, + "epoch": 0.016352443247402846, + "flos": 1410557234688.0, + "grad_norm": 0.23342967148410274, + "language_loss": 0.74774313, + "learning_rate": 0.0008797580069832641, + "loss": 0.76473522, + "num_input_tokens_seen": 6472960, + "router_z_loss_mlp": 0.16699219, + "step": 85, + "time_per_iteration": 4.599137306213379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01212855, + "balance_loss_mlp": 1.19580793, + "epoch": 0.01654482493266641, + "flos": 730177127424.0, + "grad_norm": 0.09253845479208671, + "language_loss": 1.04518116, + "learning_rate": 0.0008820741205014318, + "loss": 1.05730963, + "num_input_tokens_seen": 6548912, + "router_z_loss_mlp": 0.1706543, + "step": 86, + "time_per_iteration": 2.8595266342163086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01246652, + "balance_loss_mlp": 1.22939014, + "epoch": 0.016737206617929972, + "flos": 536016932352.0, + "grad_norm": 0.10044068584300966, + "language_loss": 1.06437612, + "learning_rate": 0.0008843634575408404, + "loss": 1.07684278, + "num_input_tokens_seen": 6621520, + "router_z_loss_mlp": 0.17248535, + "step": 87, + "time_per_iteration": 2.690492630004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215448, + "balance_loss_mlp": 1.19887805, + "epoch": 0.016929588303193535, + "flos": 536706584064.0, + "grad_norm": 0.0661610487366718, + "language_loss": 1.07674646, + "learning_rate": 0.0008866266301555082, + "loss": 1.08890104, + "num_input_tokens_seen": 6698432, + "router_z_loss_mlp": 0.16577148, + "step": 88, + "time_per_iteration": 2.737339496612549 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203027, + "balance_loss_mlp": 1.18706512, + "epoch": 0.017121969988457098, + "flos": 526498458624.0, + "grad_norm": 0.07897226836222233, + "language_loss": 1.08543992, + "learning_rate": 0.0008888642296509615, + "loss": 1.09747016, + "num_input_tokens_seen": 6764336, + "router_z_loss_mlp": 0.1595459, + "step": 89, + "time_per_iteration": 2.576819658279419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01187655, + "balance_loss_mlp": 1.17131162, + "epoch": 0.01731435167372066, + "flos": 625304876544.0, + "grad_norm": 0.0740353605135553, + "language_loss": 1.13367987, + "learning_rate": 0.0008910768275115906, + "loss": 1.14555645, + "num_input_tokens_seen": 6839392, + "router_z_loss_mlp": 0.16345215, + "step": 90, + "time_per_iteration": 2.778571128845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173459, + "balance_loss_mlp": 1.15692425, + "epoch": 0.017506733358984224, + "flos": 496157709312.0, + "grad_norm": 0.07518713147028631, + "language_loss": 1.08794332, + "learning_rate": 0.0008932649762767675, + "loss": 1.0996778, + "num_input_tokens_seen": 6907344, + "router_z_loss_mlp": 0.16540527, + "step": 91, + "time_per_iteration": 2.5931665897369385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01185544, + "balance_loss_mlp": 1.16881919, + "epoch": 0.017699115044247787, + "flos": 745613047296.0, + "grad_norm": 0.07711429280558382, + "language_loss": 1.11576343, + "learning_rate": 0.0008954292103690864, + "loss": 1.12761879, + "num_input_tokens_seen": 6982464, + "router_z_loss_mlp": 0.1673584, + "step": 92, + "time_per_iteration": 2.9129488468170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01194769, + "balance_loss_mlp": 1.17854476, + "epoch": 0.01789149672951135, + "flos": 515257265664.0, + "grad_norm": 0.0669718610224715, + "language_loss": 1.1343056, + "learning_rate": 0.0008975700468778296, + "loss": 1.14625335, + "num_input_tokens_seen": 7049712, + "router_z_loss_mlp": 0.16223145, + "step": 93, + "time_per_iteration": 2.576620101928711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01216953, + "balance_loss_mlp": 1.20076382, + "epoch": 0.018083878414774913, + "flos": 585850116096.0, + "grad_norm": 0.11698648494194364, + "language_loss": 1.0652318, + "learning_rate": 0.0008996879863005366, + "loss": 1.07740128, + "num_input_tokens_seen": 7120288, + "router_z_loss_mlp": 0.16186523, + "step": 94, + "time_per_iteration": 2.6751108169555664 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217025, + "balance_loss_mlp": 1.2013253, + "epoch": 0.018276260100038477, + "flos": 497103436800.0, + "grad_norm": 0.08327491501556071, + "language_loss": 1.06208014, + "learning_rate": 0.0009017835132453337, + "loss": 1.07425046, + "num_input_tokens_seen": 7188896, + "router_z_loss_mlp": 0.15686035, + "step": 95, + "time_per_iteration": 2.5971803665161133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196463, + "balance_loss_mlp": 1.1804409, + "epoch": 0.01846864178530204, + "flos": 639765955584.0, + "grad_norm": 0.09756000368948786, + "language_loss": 1.06920743, + "learning_rate": 0.0009038570970964896, + "loss": 1.08117199, + "num_input_tokens_seen": 7259536, + "router_z_loss_mlp": 0.16027832, + "step": 96, + "time_per_iteration": 2.7428832054138184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01173361, + "balance_loss_mlp": 1.15723228, + "epoch": 0.018661023470565603, + "flos": 511411746816.0, + "grad_norm": 0.07053433913024812, + "language_loss": 1.04343212, + "learning_rate": 0.0009059091926454854, + "loss": 1.05516577, + "num_input_tokens_seen": 7326752, + "router_z_loss_mlp": 0.16125488, + "step": 97, + "time_per_iteration": 2.570509433746338 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01178324, + "balance_loss_mlp": 1.16246903, + "epoch": 0.018853405155829166, + "flos": 930710103552.0, + "grad_norm": 0.08767892767743933, + "language_loss": 1.03389072, + "learning_rate": 0.0009079402406897198, + "loss": 1.04567385, + "num_input_tokens_seen": 7417488, + "router_z_loss_mlp": 0.15844727, + "step": 98, + "time_per_iteration": 3.202298164367676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179075, + "balance_loss_mlp": 1.16296983, + "epoch": 0.01904578684109273, + "flos": 576209396736.0, + "grad_norm": 0.2639136557883628, + "language_loss": 1.0596242, + "learning_rate": 0.0009099506686008212, + "loss": 1.07141495, + "num_input_tokens_seen": 7493136, + "router_z_loss_mlp": 0.16101074, + "step": 99, + "time_per_iteration": 2.812368869781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139923, + "balance_loss_mlp": 1.12423468, + "epoch": 0.019238168526356292, + "flos": 558173431296.0, + "grad_norm": 0.12311670746354397, + "language_loss": 1.08180976, + "learning_rate": 0.0009119408908644013, + "loss": 1.09320903, + "num_input_tokens_seen": 7560896, + "router_z_loss_mlp": 0.15673828, + "step": 100, + "time_per_iteration": 2.7063775062561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150815, + "balance_loss_mlp": 1.13574743, + "epoch": 0.019430550211619855, + "flos": 723539506176.0, + "grad_norm": 0.12127606313133317, + "language_loss": 1.14121008, + "learning_rate": 0.0009139113095929519, + "loss": 1.15271831, + "num_input_tokens_seen": 7629040, + "router_z_loss_mlp": 0.15039062, + "step": 101, + "time_per_iteration": 2.840913772583008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01218173, + "balance_loss_mlp": 1.20243776, + "epoch": 0.019622931896883418, + "flos": 499235000832.0, + "grad_norm": 0.1104247345061639, + "language_loss": 1.0836457, + "learning_rate": 0.0009158623150134762, + "loss": 1.09582746, + "num_input_tokens_seen": 7694256, + "router_z_loss_mlp": 0.15722656, + "step": 102, + "time_per_iteration": 2.560464859008789 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01357908, + "balance_loss_mlp": 1.34204173, + "epoch": 0.01981531358214698, + "flos": 508916418048.0, + "grad_norm": 0.15164768975642337, + "language_loss": 1.07661259, + "learning_rate": 0.000917794285931332, + "loss": 1.0901916, + "num_input_tokens_seen": 7762256, + "router_z_loss_mlp": 0.15856934, + "step": 103, + "time_per_iteration": 2.6684353351593018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381196, + "balance_loss_mlp": 1.36572242, + "epoch": 0.020007695267410544, + "flos": 521087371776.0, + "grad_norm": 0.10342928287682196, + "language_loss": 0.9971087, + "learning_rate": 0.0009197075901716639, + "loss": 1.01092052, + "num_input_tokens_seen": 7834400, + "router_z_loss_mlp": 0.15454102, + "step": 104, + "time_per_iteration": 2.7250871658325195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01356986, + "balance_loss_mlp": 1.34017777, + "epoch": 0.020200076952674107, + "flos": 533013834240.0, + "grad_norm": 0.1824265866479698, + "language_loss": 1.09647703, + "learning_rate": 0.0009216025849997171, + "loss": 1.11004686, + "num_input_tokens_seen": 7911184, + "router_z_loss_mlp": 0.16809082, + "step": 105, + "time_per_iteration": 2.776764154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01261961, + "balance_loss_mlp": 1.24583197, + "epoch": 0.020392458637937667, + "flos": 684430981632.0, + "grad_norm": 0.06376163654280764, + "language_loss": 1.0425086, + "learning_rate": 0.0009234796175212258, + "loss": 1.05512834, + "num_input_tokens_seen": 7985280, + "router_z_loss_mlp": 0.16125488, + "step": 106, + "time_per_iteration": 2.9174978733062744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01269614, + "balance_loss_mlp": 1.25201869, + "epoch": 0.02058484032320123, + "flos": 701791852032.0, + "grad_norm": 0.060044663360548714, + "language_loss": 1.08808422, + "learning_rate": 0.000925339025064007, + "loss": 1.10078037, + "num_input_tokens_seen": 8068320, + "router_z_loss_mlp": 0.17590332, + "step": 107, + "time_per_iteration": 2.975735902786255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324579, + "balance_loss_mlp": 1.30547023, + "epoch": 0.020777222008464793, + "flos": 638772175872.0, + "grad_norm": 0.12680512225677842, + "language_loss": 1.01262307, + "learning_rate": 0.0009271811355418027, + "loss": 1.02586877, + "num_input_tokens_seen": 8148144, + "router_z_loss_mlp": 0.19128418, + "step": 108, + "time_per_iteration": 2.8408150672912598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01306621, + "balance_loss_mlp": 1.28755951, + "epoch": 0.020969603693728356, + "flos": 681785856000.0, + "grad_norm": 0.06997483982989385, + "language_loss": 1.08551693, + "learning_rate": 0.0009290062678013548, + "loss": 1.09858322, + "num_input_tokens_seen": 8222256, + "router_z_loss_mlp": 0.19055176, + "step": 109, + "time_per_iteration": 2.869980812072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309468, + "balance_loss_mlp": 1.29159832, + "epoch": 0.02116198537899192, + "flos": 533140462080.0, + "grad_norm": 0.13190855435004306, + "language_loss": 1.06647623, + "learning_rate": 0.0009308147319536321, + "loss": 1.07957077, + "num_input_tokens_seen": 8292432, + "router_z_loss_mlp": 0.17895508, + "step": 110, + "time_per_iteration": 2.6270735263824463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130688, + "balance_loss_mlp": 1.29067969, + "epoch": 0.021354367064255482, + "flos": 717168135168.0, + "grad_norm": 0.10963649287068344, + "language_loss": 1.1282903, + "learning_rate": 0.0009326068296900676, + "loss": 1.14135909, + "num_input_tokens_seen": 8365024, + "router_z_loss_mlp": 0.1619873, + "step": 111, + "time_per_iteration": 2.8845341205596924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01388527, + "balance_loss_mlp": 1.37200487, + "epoch": 0.021546748749519045, + "flos": 519290459136.0, + "grad_norm": 0.12406482447985402, + "language_loss": 1.03902006, + "learning_rate": 0.0009343828545846161, + "loss": 1.05290532, + "num_input_tokens_seen": 8442448, + "router_z_loss_mlp": 0.16516113, + "step": 112, + "time_per_iteration": 2.8167102336883545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01548404, + "balance_loss_mlp": 1.53109479, + "epoch": 0.021739130434782608, + "flos": 504912337920.0, + "grad_norm": 0.2528517188051562, + "language_loss": 1.0722419, + "learning_rate": 0.0009361430923823841, + "loss": 1.08772588, + "num_input_tokens_seen": 8508992, + "router_z_loss_mlp": 0.1730957, + "step": 113, + "time_per_iteration": 2.664581060409546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01441472, + "balance_loss_mlp": 1.42576015, + "epoch": 0.02193151212004617, + "flos": 463251820032.0, + "grad_norm": 0.1910881492312462, + "language_loss": 1.11420846, + "learning_rate": 0.0009378878212755459, + "loss": 1.12862325, + "num_input_tokens_seen": 8574048, + "router_z_loss_mlp": 0.15710449, + "step": 114, + "time_per_iteration": 2.4851133823394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262203, + "balance_loss_mlp": 1.24767148, + "epoch": 0.022123893805309734, + "flos": 552008673792.0, + "grad_norm": 0.09004287588953173, + "language_loss": 1.0099957, + "learning_rate": 0.0009396173121672103, + "loss": 1.0226177, + "num_input_tokens_seen": 8647808, + "router_z_loss_mlp": 0.14538574, + "step": 115, + "time_per_iteration": 2.6535162925720215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01215709, + "balance_loss_mlp": 1.20165396, + "epoch": 0.022316275490573297, + "flos": 635920436736.0, + "grad_norm": 0.07849561533847389, + "language_loss": 1.07122314, + "learning_rate": 0.0009413318289238633, + "loss": 1.08338022, + "num_input_tokens_seen": 8719760, + "router_z_loss_mlp": 0.14050293, + "step": 116, + "time_per_iteration": 2.7836899757385254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01203544, + "balance_loss_mlp": 1.18965602, + "epoch": 0.02250865717583686, + "flos": 798535107072.0, + "grad_norm": 0.07099947506123377, + "language_loss": 0.98912275, + "learning_rate": 0.0009430316286169771, + "loss": 1.00115824, + "num_input_tokens_seen": 8798752, + "router_z_loss_mlp": 0.13891602, + "step": 117, + "time_per_iteration": 3.049468517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01263206, + "balance_loss_mlp": 1.24786401, + "epoch": 0.022701038861100423, + "flos": 455851763712.0, + "grad_norm": 0.18808502465815918, + "language_loss": 1.04843259, + "learning_rate": 0.0009447169617543361, + "loss": 1.06106472, + "num_input_tokens_seen": 8866848, + "router_z_loss_mlp": 0.15319824, + "step": 118, + "time_per_iteration": 2.5886504650115967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121023, + "balance_loss_mlp": 1.19557953, + "epoch": 0.022893420546363986, + "flos": 582812112384.0, + "grad_norm": 0.09179634719817005, + "language_loss": 1.11139297, + "learning_rate": 0.0009463880725016029, + "loss": 1.12349522, + "num_input_tokens_seen": 8935488, + "router_z_loss_mlp": 0.14648438, + "step": 119, + "time_per_iteration": 2.6861259937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169264, + "balance_loss_mlp": 1.15572226, + "epoch": 0.02308580223162755, + "flos": 561010613760.0, + "grad_norm": 0.09164108943144146, + "language_loss": 1.05675769, + "learning_rate": 0.0009480451988946134, + "loss": 1.06845045, + "num_input_tokens_seen": 9015344, + "router_z_loss_mlp": 0.13549805, + "step": 120, + "time_per_iteration": 2.8075129985809326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217336, + "balance_loss_mlp": 1.2034359, + "epoch": 0.023278183916891113, + "flos": 770966111232.0, + "grad_norm": 0.1019945076921087, + "language_loss": 1.07486713, + "learning_rate": 0.0009496885730428627, + "loss": 1.08704054, + "num_input_tokens_seen": 9094672, + "router_z_loss_mlp": 0.13903809, + "step": 121, + "time_per_iteration": 3.0081264972686768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291544, + "balance_loss_mlp": 1.27698815, + "epoch": 0.023470565602154676, + "flos": 553111552512.0, + "grad_norm": 0.08478902086488087, + "language_loss": 1.05369067, + "learning_rate": 0.0009513184213246156, + "loss": 1.06660616, + "num_input_tokens_seen": 9160608, + "router_z_loss_mlp": 0.14550781, + "step": 122, + "time_per_iteration": 2.654902696609497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406128, + "balance_loss_mlp": 1.39054775, + "epoch": 0.02366294728741824, + "flos": 559744791552.0, + "grad_norm": 0.09837859270685317, + "language_loss": 1.09463692, + "learning_rate": 0.0009529349645740552, + "loss": 1.10869825, + "num_input_tokens_seen": 9228704, + "router_z_loss_mlp": 0.15563965, + "step": 123, + "time_per_iteration": 2.6837081909179688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01484693, + "balance_loss_mlp": 1.46961284, + "epoch": 0.0238553289726818, + "flos": 468313698816.0, + "grad_norm": 0.11388616458843728, + "language_loss": 1.07573724, + "learning_rate": 0.0009545384182608524, + "loss": 1.09058416, + "num_input_tokens_seen": 9294288, + "router_z_loss_mlp": 0.1505127, + "step": 124, + "time_per_iteration": 2.5069937705993652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01411359, + "balance_loss_mlp": 1.39688659, + "epoch": 0.024047710657945365, + "flos": 559763730432.0, + "grad_norm": 0.3429043048666504, + "language_loss": 1.05057025, + "learning_rate": 0.0009561289926625252, + "loss": 1.06468379, + "num_input_tokens_seen": 9368048, + "router_z_loss_mlp": 0.14465332, + "step": 125, + "time_per_iteration": 2.6802117824554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011688, + "balance_loss_mlp": 1.15507352, + "epoch": 0.024240092343208928, + "flos": 504528224256.0, + "grad_norm": 0.18048320350440872, + "language_loss": 1.09737623, + "learning_rate": 0.0009577068930299292, + "loss": 1.10906434, + "num_input_tokens_seen": 9434848, + "router_z_loss_mlp": 0.13739014, + "step": 126, + "time_per_iteration": 2.6096670627593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163735, + "balance_loss_mlp": 1.15040147, + "epoch": 0.02443247402847249, + "flos": 435516908544.0, + "grad_norm": 0.07278748671530755, + "language_loss": 1.05931616, + "learning_rate": 0.0009592723197462087, + "loss": 1.07095349, + "num_input_tokens_seen": 9504112, + "router_z_loss_mlp": 0.13360596, + "step": 127, + "time_per_iteration": 2.6409482955932617 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01248107, + "balance_loss_mlp": 1.23239577, + "epoch": 0.024624855713736054, + "flos": 683445966336.0, + "grad_norm": 0.0813490266373729, + "language_loss": 1.02871299, + "learning_rate": 0.0009608254684795125, + "loss": 1.04119396, + "num_input_tokens_seen": 9590032, + "router_z_loss_mlp": 0.15710449, + "step": 128, + "time_per_iteration": 2.940600872039795 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265693, + "balance_loss_mlp": 1.24772859, + "epoch": 0.024817237398999614, + "flos": 524721894912.0, + "grad_norm": 0.0804185451989367, + "language_loss": 1.06161952, + "learning_rate": 0.0009623665303297678, + "loss": 1.07427645, + "num_input_tokens_seen": 9663040, + "router_z_loss_mlp": 0.1796875, + "step": 129, + "time_per_iteration": 2.7088472843170166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256284, + "balance_loss_mlp": 1.23668599, + "epoch": 0.025009619084263177, + "flos": 655350262272.0, + "grad_norm": 0.12369480901617341, + "language_loss": 1.10218048, + "learning_rate": 0.0009638956919697878, + "loss": 1.11474347, + "num_input_tokens_seen": 9736544, + "router_z_loss_mlp": 0.19592285, + "step": 130, + "time_per_iteration": 2.857571840286255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01266475, + "balance_loss_mlp": 1.24420691, + "epoch": 0.02520200076952674, + "flos": 454187271168.0, + "grad_norm": 0.08293639348197612, + "language_loss": 1.02638018, + "learning_rate": 0.0009654131357809714, + "loss": 1.03904486, + "num_input_tokens_seen": 9804656, + "router_z_loss_mlp": 0.22253418, + "step": 131, + "time_per_iteration": 2.641470432281494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128644, + "balance_loss_mlp": 1.26142943, + "epoch": 0.025394382454790303, + "flos": 839427397632.0, + "grad_norm": 0.05741461740254168, + "language_loss": 1.11002767, + "learning_rate": 0.0009669190399838441, + "loss": 1.12289214, + "num_input_tokens_seen": 9888864, + "router_z_loss_mlp": 0.25036621, + "step": 132, + "time_per_iteration": 3.133596420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01302533, + "balance_loss_mlp": 1.27633083, + "epoch": 0.025586764140053866, + "flos": 580725628416.0, + "grad_norm": 0.06987664196058198, + "language_loss": 1.0413487, + "learning_rate": 0.0009684135787636724, + "loss": 1.05437398, + "num_input_tokens_seen": 9968208, + "router_z_loss_mlp": 0.26208496, + "step": 133, + "time_per_iteration": 2.7968075275421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01325396, + "balance_loss_mlp": 1.29710746, + "epoch": 0.02577914582531743, + "flos": 789893959680.0, + "grad_norm": 0.07551411578012862, + "language_loss": 1.07757604, + "learning_rate": 0.0009698969223913726, + "loss": 1.09083009, + "num_input_tokens_seen": 10049664, + "router_z_loss_mlp": 0.28283691, + "step": 134, + "time_per_iteration": 3.0058987140655518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131126, + "balance_loss_mlp": 1.28212547, + "epoch": 0.025971527510580992, + "flos": 594683320320.0, + "grad_norm": 0.0731546450398535, + "language_loss": 1.10457921, + "learning_rate": 0.0009713692373399265, + "loss": 1.11769176, + "num_input_tokens_seen": 10120096, + "router_z_loss_mlp": 0.29125977, + "step": 135, + "time_per_iteration": 2.6654229164123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02152319, + "balance_loss_mlp": 1.95700705, + "epoch": 0.026163909195844555, + "flos": 1576771522560.0, + "grad_norm": 0.26755932757436196, + "language_loss": 0.79456228, + "learning_rate": 0.0009728306863964993, + "loss": 0.81608546, + "num_input_tokens_seen": 10348976, + "router_z_loss_mlp": 1.953125, + "step": 136, + "time_per_iteration": 6.531313896179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01724331, + "balance_loss_mlp": 1.55266988, + "epoch": 0.026356290881108118, + "flos": 1501306030080.0, + "grad_norm": 0.1444935793983717, + "language_loss": 0.77811038, + "learning_rate": 0.0009742814287704512, + "loss": 0.79535371, + "num_input_tokens_seen": 10576512, + "router_z_loss_mlp": 1.71875, + "step": 137, + "time_per_iteration": 4.966995716094971 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01371776, + "balance_loss_mlp": 1.34284425, + "epoch": 0.02654867256637168, + "flos": 596841025536.0, + "grad_norm": 0.06823267419395149, + "language_loss": 1.03539467, + "learning_rate": 0.0009757216201974225, + "loss": 1.04911256, + "num_input_tokens_seen": 10659168, + "router_z_loss_mlp": 0.28918457, + "step": 138, + "time_per_iteration": 2.7901663780212402 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01396345, + "balance_loss_mlp": 1.36752045, + "epoch": 0.026741054251635244, + "flos": 544761386496.0, + "grad_norm": 0.08904352821745645, + "language_loss": 1.08793342, + "learning_rate": 0.0009771514130396581, + "loss": 1.10189688, + "num_input_tokens_seen": 10731584, + "router_z_loss_mlp": 0.28833008, + "step": 139, + "time_per_iteration": 2.664384603500366 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01410566, + "balance_loss_mlp": 1.38171697, + "epoch": 0.026933435936898807, + "flos": 506591387136.0, + "grad_norm": 0.09467843708761726, + "language_loss": 1.08393478, + "learning_rate": 0.00097857095638274, + "loss": 1.09804034, + "num_input_tokens_seen": 10799456, + "router_z_loss_mlp": 0.28833008, + "step": 140, + "time_per_iteration": 2.5600626468658447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01399161, + "balance_loss_mlp": 1.37263703, + "epoch": 0.02712581762216237, + "flos": 740513290752.0, + "grad_norm": 0.06303030428856128, + "language_loss": 0.99670362, + "learning_rate": 0.0009799803961288726, + "loss": 1.01069522, + "num_input_tokens_seen": 10886416, + "router_z_loss_mlp": 0.26538086, + "step": 141, + "time_per_iteration": 2.984253168106079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01354082, + "balance_loss_mlp": 1.33143175, + "epoch": 0.027318199307425933, + "flos": 848023464960.0, + "grad_norm": 0.06264638149228761, + "language_loss": 1.05898559, + "learning_rate": 0.000981379875086876, + "loss": 1.07252645, + "num_input_tokens_seen": 10966064, + "router_z_loss_mlp": 0.22644043, + "step": 142, + "time_per_iteration": 3.032597064971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01323808, + "balance_loss_mlp": 1.30553341, + "epoch": 0.027510580992689496, + "flos": 575288400384.0, + "grad_norm": 0.07028220907739285, + "language_loss": 1.01752293, + "learning_rate": 0.0009827695330590185, + "loss": 1.030761, + "num_input_tokens_seen": 11039712, + "router_z_loss_mlp": 0.18273926, + "step": 143, + "time_per_iteration": 2.626483678817749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01303402, + "balance_loss_mlp": 1.28557992, + "epoch": 0.02770296267795306, + "flos": 772079164416.0, + "grad_norm": 0.05744811954937285, + "language_loss": 1.00619161, + "learning_rate": 0.0009841495069248256, + "loss": 1.0192256, + "num_input_tokens_seen": 11123984, + "router_z_loss_mlp": 0.17822266, + "step": 144, + "time_per_iteration": 2.9495198726654053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01316023, + "balance_loss_mlp": 1.29916632, + "epoch": 0.027895344363216622, + "flos": 569123642880.0, + "grad_norm": 0.04968902291069247, + "language_loss": 0.9920603, + "learning_rate": 0.0009855199307219871, + "loss": 1.00522041, + "num_input_tokens_seen": 11192864, + "router_z_loss_mlp": 0.1685791, + "step": 145, + "time_per_iteration": 2.6407721042633057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130391, + "balance_loss_mlp": 1.28731608, + "epoch": 0.028087726048480186, + "flos": 547099564032.0, + "grad_norm": 0.10723696528856613, + "language_loss": 1.01566505, + "learning_rate": 0.0009868809357244854, + "loss": 1.02870417, + "num_input_tokens_seen": 11261760, + "router_z_loss_mlp": 0.16589355, + "step": 146, + "time_per_iteration": 2.6262452602386475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01287507, + "balance_loss_mlp": 1.27153277, + "epoch": 0.02828010773374375, + "flos": 524519663616.0, + "grad_norm": 0.06991830692152445, + "language_loss": 1.05632663, + "learning_rate": 0.0009882326505180556, + "loss": 1.06920183, + "num_input_tokens_seen": 11334736, + "router_z_loss_mlp": 0.15966797, + "step": 147, + "time_per_iteration": 2.6469435691833496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270213, + "balance_loss_mlp": 1.2534517, + "epoch": 0.02847248941900731, + "flos": 772108277760.0, + "grad_norm": 0.07309095407736986, + "language_loss": 1.04486537, + "learning_rate": 0.0009895752010730906, + "loss": 1.0575676, + "num_input_tokens_seen": 11409872, + "router_z_loss_mlp": 0.16748047, + "step": 148, + "time_per_iteration": 2.9457786083221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012724, + "balance_loss_mlp": 1.25667655, + "epoch": 0.028664871104270875, + "flos": 534150208512.0, + "grad_norm": 0.048334696317449924, + "language_loss": 1.10088921, + "learning_rate": 0.0009909087108150867, + "loss": 1.11361325, + "num_input_tokens_seen": 11481024, + "router_z_loss_mlp": 0.15710449, + "step": 149, + "time_per_iteration": 2.712559700012207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309133, + "balance_loss_mlp": 1.29286051, + "epoch": 0.028857252789534438, + "flos": 367557599232.0, + "grad_norm": 0.13115053493636905, + "language_loss": 1.11238122, + "learning_rate": 0.0009922333006927371, + "loss": 1.12547255, + "num_input_tokens_seen": 11544240, + "router_z_loss_mlp": 0.16247559, + "step": 150, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01329212, + "balance_loss_mlp": 1.31257081, + "epoch": 0.029049634474798, + "flos": 515232534528.0, + "grad_norm": 0.06948512606819708, + "language_loss": 1.04613614, + "learning_rate": 0.0009935490892437632, + "loss": 1.05942833, + "num_input_tokens_seen": 11610416, + "router_z_loss_mlp": 0.16650391, + "step": 151, + "time_per_iteration": 2.5460238456726074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01309109, + "balance_loss_mlp": 1.29317045, + "epoch": 0.029242016160061564, + "flos": 587840495616.0, + "grad_norm": 0.11257287432569656, + "language_loss": 1.03097093, + "learning_rate": 0.0009948561926585687, + "loss": 1.04406202, + "num_input_tokens_seen": 11687488, + "router_z_loss_mlp": 0.15930176, + "step": 152, + "time_per_iteration": 2.753009557723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300362, + "balance_loss_mlp": 1.28555596, + "epoch": 0.029434397845325123, + "flos": 551816616960.0, + "grad_norm": 0.062223246716750634, + "language_loss": 1.06524086, + "learning_rate": 0.0009961547248418122, + "loss": 1.07824445, + "num_input_tokens_seen": 11754576, + "router_z_loss_mlp": 0.14807129, + "step": 153, + "time_per_iteration": 2.630092144012451 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01308008, + "balance_loss_mlp": 1.29357219, + "epoch": 0.029626779530588686, + "flos": 603221160960.0, + "grad_norm": 0.09420536563091944, + "language_loss": 1.03062868, + "learning_rate": 0.0009974447974719707, + "loss": 1.04370856, + "num_input_tokens_seen": 11831360, + "router_z_loss_mlp": 0.14440918, + "step": 154, + "time_per_iteration": 2.6962759494781494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01312448, + "balance_loss_mlp": 1.29745138, + "epoch": 0.02981916121585225, + "flos": 620808993792.0, + "grad_norm": 0.08558703297148447, + "language_loss": 1.04985213, + "learning_rate": 0.0009987265200589763, + "loss": 1.0629766, + "num_input_tokens_seen": 11902192, + "router_z_loss_mlp": 0.15002441, + "step": 155, + "time_per_iteration": 2.7059414386749268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295882, + "balance_loss_mlp": 1.28057528, + "epoch": 0.030011542901115813, + "flos": 661322962944.0, + "grad_norm": 0.09731995783752632, + "language_loss": 1.04436159, + "learning_rate": 0.001, + "loss": 1.05732036, + "num_input_tokens_seen": 11979088, + "router_z_loss_mlp": 0.1529541, + "step": 156, + "time_per_iteration": 2.856968641281128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262708, + "balance_loss_mlp": 1.24682927, + "epoch": 0.030203924586379376, + "flos": 651258842112.0, + "grad_norm": 0.05966927829613408, + "language_loss": 1.02520585, + "learning_rate": 0.0009999999029413921, + "loss": 1.03783274, + "num_input_tokens_seen": 12059200, + "router_z_loss_mlp": 0.15856934, + "step": 157, + "time_per_iteration": 2.851480722427368 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01268181, + "balance_loss_mlp": 1.25150406, + "epoch": 0.03039630627164294, + "flos": 531083091456.0, + "grad_norm": 0.1034311415514979, + "language_loss": 1.04085183, + "learning_rate": 0.0009999996117656068, + "loss": 1.05353379, + "num_input_tokens_seen": 12134944, + "router_z_loss_mlp": 0.16674805, + "step": 158, + "time_per_iteration": 2.707646369934082 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262524, + "balance_loss_mlp": 1.24747968, + "epoch": 0.030588687956906502, + "flos": 585914135040.0, + "grad_norm": 0.12050944658187299, + "language_loss": 0.97824669, + "learning_rate": 0.0009999991264727564, + "loss": 0.99087203, + "num_input_tokens_seen": 12207936, + "router_z_loss_mlp": 0.15039062, + "step": 159, + "time_per_iteration": 2.7575390338897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272116, + "balance_loss_mlp": 1.25716722, + "epoch": 0.030781069642170065, + "flos": 513026777088.0, + "grad_norm": 0.07020206521781955, + "language_loss": 1.08316755, + "learning_rate": 0.0009999984470630296, + "loss": 1.09588861, + "num_input_tokens_seen": 12273200, + "router_z_loss_mlp": 0.14929199, + "step": 160, + "time_per_iteration": 2.62310528755188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128559, + "balance_loss_mlp": 1.27058172, + "epoch": 0.030973451327433628, + "flos": 717766064640.0, + "grad_norm": 0.06068839125924313, + "language_loss": 0.96528012, + "learning_rate": 0.0009999975735366902, + "loss": 0.978136, + "num_input_tokens_seen": 12359600, + "router_z_loss_mlp": 0.15002441, + "step": 161, + "time_per_iteration": 3.0823376178741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01305752, + "balance_loss_mlp": 1.29055238, + "epoch": 0.03116583301269719, + "flos": 1109312133120.0, + "grad_norm": 0.09428930343360856, + "language_loss": 0.98546314, + "learning_rate": 0.0009999965058940775, + "loss": 0.99852067, + "num_input_tokens_seen": 12443936, + "router_z_loss_mlp": 0.1517334, + "step": 162, + "time_per_iteration": 3.486618995666504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01315996, + "balance_loss_mlp": 1.3010118, + "epoch": 0.031358214697960754, + "flos": 450676403712.0, + "grad_norm": 0.09976775191278689, + "language_loss": 1.04580116, + "learning_rate": 0.0009999952441356057, + "loss": 1.05896115, + "num_input_tokens_seen": 12507488, + "router_z_loss_mlp": 0.1496582, + "step": 163, + "time_per_iteration": 2.537173271179199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01300744, + "balance_loss_mlp": 1.28654623, + "epoch": 0.031550596383224314, + "flos": 1254701325312.0, + "grad_norm": 0.0838197011845512, + "language_loss": 1.05903006, + "learning_rate": 0.000999993788261765, + "loss": 1.07203746, + "num_input_tokens_seen": 12594096, + "router_z_loss_mlp": 0.14196777, + "step": 164, + "time_per_iteration": 3.5638957023620605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01270584, + "balance_loss_mlp": 1.25625503, + "epoch": 0.03174297806848788, + "flos": 667841310720.0, + "grad_norm": 0.068717417443618, + "language_loss": 1.0642612, + "learning_rate": 0.00099999213827312, + "loss": 1.076967, + "num_input_tokens_seen": 12669424, + "router_z_loss_mlp": 0.14343262, + "step": 165, + "time_per_iteration": 2.8084213733673096 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255587, + "balance_loss_mlp": 1.24152076, + "epoch": 0.03193535975375144, + "flos": 551033832960.0, + "grad_norm": 0.06892139424853191, + "language_loss": 1.0208962, + "learning_rate": 0.000999990294170312, + "loss": 1.03345203, + "num_input_tokens_seen": 12740080, + "router_z_loss_mlp": 0.14074707, + "step": 166, + "time_per_iteration": 2.6247787475585938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01259954, + "balance_loss_mlp": 1.24549401, + "epoch": 0.032127741439015006, + "flos": 543377700864.0, + "grad_norm": 0.08292396830811857, + "language_loss": 1.05774951, + "learning_rate": 0.0009999882559540566, + "loss": 1.07034898, + "num_input_tokens_seen": 12810576, + "router_z_loss_mlp": 0.14465332, + "step": 167, + "time_per_iteration": 2.654036283493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291491, + "balance_loss_mlp": 1.27790117, + "epoch": 0.032320123124278566, + "flos": 548104928256.0, + "grad_norm": 0.07217909902530589, + "language_loss": 1.02104354, + "learning_rate": 0.000999986023625145, + "loss": 1.03395844, + "num_input_tokens_seen": 12887904, + "router_z_loss_mlp": 0.13598633, + "step": 168, + "time_per_iteration": 2.696866750717163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.03738194, + "balance_loss_mlp": 3.61993837, + "epoch": 0.03251250480954213, + "flos": 1305156865536.0, + "grad_norm": 0.563981464368737, + "language_loss": 0.78924417, + "learning_rate": 0.0009999835971844441, + "loss": 0.82662606, + "num_input_tokens_seen": 13107344, + "router_z_loss_mlp": 1.1796875, + "step": 169, + "time_per_iteration": 4.971506834030151 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0134723, + "balance_loss_mlp": 1.33386648, + "epoch": 0.03270488649480569, + "flos": 560866609152.0, + "grad_norm": 0.12141219581883538, + "language_loss": 1.02540469, + "learning_rate": 0.0009999809766328958, + "loss": 1.03887701, + "num_input_tokens_seen": 13175552, + "router_z_loss_mlp": 0.13391113, + "step": 170, + "time_per_iteration": 2.646425724029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01355192, + "balance_loss_mlp": 1.34039843, + "epoch": 0.03289726818006926, + "flos": 482120031744.0, + "grad_norm": 0.08046017426621577, + "language_loss": 1.05186188, + "learning_rate": 0.0009999781619715177, + "loss": 1.06541371, + "num_input_tokens_seen": 13242384, + "router_z_loss_mlp": 0.14770508, + "step": 171, + "time_per_iteration": 2.535360336303711 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381569, + "balance_loss_mlp": 1.36640596, + "epoch": 0.03308964986533282, + "flos": 674355276288.0, + "grad_norm": 0.08789680193074563, + "language_loss": 1.04250002, + "learning_rate": 0.000999975153201402, + "loss": 1.05631578, + "num_input_tokens_seen": 13316160, + "router_z_loss_mlp": 0.15161133, + "step": 172, + "time_per_iteration": 2.8205513954162598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433883, + "balance_loss_mlp": 1.41711044, + "epoch": 0.033282031550596385, + "flos": 608937785856.0, + "grad_norm": 0.07610360898370483, + "language_loss": 1.02505267, + "learning_rate": 0.0009999719503237174, + "loss": 1.03939152, + "num_input_tokens_seen": 13387664, + "router_z_loss_mlp": 0.16760254, + "step": 173, + "time_per_iteration": 2.738676071166992 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451195, + "balance_loss_mlp": 1.43315864, + "epoch": 0.033474413235859944, + "flos": 467801547264.0, + "grad_norm": 0.07270846083900323, + "language_loss": 1.111094, + "learning_rate": 0.0009999685533397073, + "loss": 1.12560594, + "num_input_tokens_seen": 13454528, + "router_z_loss_mlp": 0.18029785, + "step": 174, + "time_per_iteration": 2.5556905269622803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01429898, + "balance_loss_mlp": 1.41368508, + "epoch": 0.03366679492112351, + "flos": 579365263872.0, + "grad_norm": 0.09196642879711979, + "language_loss": 1.03199494, + "learning_rate": 0.00099996496225069, + "loss": 1.04629397, + "num_input_tokens_seen": 13522528, + "router_z_loss_mlp": 0.16210938, + "step": 175, + "time_per_iteration": 2.6806485652923584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01432234, + "balance_loss_mlp": 1.41513896, + "epoch": 0.03385917660638707, + "flos": 637378315776.0, + "grad_norm": 0.08705990667808558, + "language_loss": 1.05897307, + "learning_rate": 0.0009999611770580604, + "loss": 1.07329535, + "num_input_tokens_seen": 13601120, + "router_z_loss_mlp": 0.17102051, + "step": 176, + "time_per_iteration": 2.830826759338379 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415158, + "balance_loss_mlp": 1.39910054, + "epoch": 0.03405155829165064, + "flos": 441587123712.0, + "grad_norm": 0.08054669051038237, + "language_loss": 1.03868258, + "learning_rate": 0.0009999571977632876, + "loss": 1.05283427, + "num_input_tokens_seen": 13666384, + "router_z_loss_mlp": 0.16052246, + "step": 177, + "time_per_iteration": 2.623309850692749 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463141, + "balance_loss_mlp": 1.44573641, + "epoch": 0.034243939976914196, + "flos": 466097766912.0, + "grad_norm": 0.08089290506220445, + "language_loss": 1.06928194, + "learning_rate": 0.0009999530243679166, + "loss": 1.08391333, + "num_input_tokens_seen": 13733968, + "router_z_loss_mlp": 0.17407227, + "step": 178, + "time_per_iteration": 2.545133113861084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01451423, + "balance_loss_mlp": 1.43560433, + "epoch": 0.03443632166217776, + "flos": 778919016960.0, + "grad_norm": 0.08468734735068614, + "language_loss": 1.01505899, + "learning_rate": 0.0009999486568735675, + "loss": 1.0295732, + "num_input_tokens_seen": 13818960, + "router_z_loss_mlp": 0.15808105, + "step": 179, + "time_per_iteration": 3.0384457111358643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01433641, + "balance_loss_mlp": 1.41778612, + "epoch": 0.03462870334744132, + "flos": 1263284246016.0, + "grad_norm": 0.06997324880309466, + "language_loss": 1.01388979, + "learning_rate": 0.0009999440952819362, + "loss": 1.02822614, + "num_input_tokens_seen": 13912448, + "router_z_loss_mlp": 0.15856934, + "step": 180, + "time_per_iteration": 3.6892786026000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01401308, + "balance_loss_mlp": 1.38610911, + "epoch": 0.03482108503270489, + "flos": 606899354112.0, + "grad_norm": 0.057831512038439566, + "language_loss": 1.02027512, + "learning_rate": 0.0009999393395947935, + "loss": 1.03428817, + "num_input_tokens_seen": 13990752, + "router_z_loss_mlp": 0.15185547, + "step": 181, + "time_per_iteration": 2.826353073120117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01381551, + "balance_loss_mlp": 1.36612535, + "epoch": 0.03501346671796845, + "flos": 538010284032.0, + "grad_norm": 0.05913415109875365, + "language_loss": 1.05361927, + "learning_rate": 0.0009999343898139858, + "loss": 1.06743479, + "num_input_tokens_seen": 14058608, + "router_z_loss_mlp": 0.1541748, + "step": 182, + "time_per_iteration": 2.593250036239624 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01359754, + "balance_loss_mlp": 1.33988214, + "epoch": 0.035205848403232015, + "flos": 518231250432.0, + "grad_norm": 0.05898920665253376, + "language_loss": 1.04308426, + "learning_rate": 0.0009999292459414348, + "loss": 1.05668187, + "num_input_tokens_seen": 14126656, + "router_z_loss_mlp": 0.19909668, + "step": 183, + "time_per_iteration": 2.565936326980591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01311064, + "balance_loss_mlp": 1.296103, + "epoch": 0.035398230088495575, + "flos": 472134486528.0, + "grad_norm": 0.06373248491183749, + "language_loss": 1.08499169, + "learning_rate": 0.0009999239079791374, + "loss": 1.09810233, + "num_input_tokens_seen": 14195840, + "router_z_loss_mlp": 0.14953613, + "step": 184, + "time_per_iteration": 2.552949905395508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130912, + "balance_loss_mlp": 1.29237127, + "epoch": 0.03559061177375914, + "flos": 511820591616.0, + "grad_norm": 0.056329932736213485, + "language_loss": 1.01337337, + "learning_rate": 0.0009999183759291659, + "loss": 1.02646446, + "num_input_tokens_seen": 14269936, + "router_z_loss_mlp": 0.16748047, + "step": 185, + "time_per_iteration": 2.741727113723755 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291511, + "balance_loss_mlp": 1.27575147, + "epoch": 0.0357829934590227, + "flos": 477146903040.0, + "grad_norm": 0.11224085577532149, + "language_loss": 1.03738213, + "learning_rate": 0.0009999126497936682, + "loss": 1.05029726, + "num_input_tokens_seen": 14334848, + "router_z_loss_mlp": 0.1574707, + "step": 186, + "time_per_iteration": 2.4901957511901855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291515, + "balance_loss_mlp": 1.27446783, + "epoch": 0.03597537514428627, + "flos": 644350588416.0, + "grad_norm": 0.06537030709871235, + "language_loss": 1.06735992, + "learning_rate": 0.0009999067295748676, + "loss": 1.08027506, + "num_input_tokens_seen": 14407888, + "router_z_loss_mlp": 0.1706543, + "step": 187, + "time_per_iteration": 2.7923052310943604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01327575, + "balance_loss_mlp": 1.30966997, + "epoch": 0.03616775682954983, + "flos": 580916275200.0, + "grad_norm": 0.06523062893181024, + "language_loss": 1.04418302, + "learning_rate": 0.000999900615275062, + "loss": 1.05745876, + "num_input_tokens_seen": 14479072, + "router_z_loss_mlp": 0.17919922, + "step": 188, + "time_per_iteration": 2.7248637676239014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295421, + "balance_loss_mlp": 1.27722955, + "epoch": 0.03636013851481339, + "flos": 382210735104.0, + "grad_norm": 0.08035209765807474, + "language_loss": 1.10347509, + "learning_rate": 0.0009998943068966256, + "loss": 1.11642933, + "num_input_tokens_seen": 14540944, + "router_z_loss_mlp": 0.18188477, + "step": 189, + "time_per_iteration": 2.429497480392456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01279097, + "balance_loss_mlp": 1.26120377, + "epoch": 0.03655252020007695, + "flos": 582954706944.0, + "grad_norm": 0.07380481555246936, + "language_loss": 1.0506779, + "learning_rate": 0.0009998878044420072, + "loss": 1.06346881, + "num_input_tokens_seen": 14611392, + "router_z_loss_mlp": 0.17907715, + "step": 190, + "time_per_iteration": 2.6878626346588135 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.012863, + "balance_loss_mlp": 1.26773953, + "epoch": 0.03674490188534051, + "flos": 471376433664.0, + "grad_norm": 0.10484442400689244, + "language_loss": 1.01223493, + "learning_rate": 0.0009998811079137318, + "loss": 1.02509785, + "num_input_tokens_seen": 14679776, + "router_z_loss_mlp": 0.18566895, + "step": 191, + "time_per_iteration": 2.561494827270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281775, + "balance_loss_mlp": 1.26645625, + "epoch": 0.03693728357060408, + "flos": 528113488896.0, + "grad_norm": 0.0609431296621874, + "language_loss": 1.01984763, + "learning_rate": 0.0009998742173143987, + "loss": 1.03266537, + "num_input_tokens_seen": 14749712, + "router_z_loss_mlp": 0.1529541, + "step": 192, + "time_per_iteration": 2.59798264503479 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336751, + "balance_loss_mlp": 1.32157528, + "epoch": 0.03712966525586764, + "flos": 798657352704.0, + "grad_norm": 0.10186248006293357, + "language_loss": 1.02005363, + "learning_rate": 0.0009998671326466833, + "loss": 1.03342128, + "num_input_tokens_seen": 14827136, + "router_z_loss_mlp": 0.15185547, + "step": 193, + "time_per_iteration": 2.9510865211486816 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01331057, + "balance_loss_mlp": 1.3157624, + "epoch": 0.037322046941131205, + "flos": 829628116992.0, + "grad_norm": 0.06375125184008373, + "language_loss": 1.02914846, + "learning_rate": 0.0009998598539133362, + "loss": 1.04245901, + "num_input_tokens_seen": 14902880, + "router_z_loss_mlp": 0.1529541, + "step": 194, + "time_per_iteration": 2.9981300830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01337882, + "balance_loss_mlp": 1.3235296, + "epoch": 0.037514428626394765, + "flos": 437460797952.0, + "grad_norm": 0.10181133305516413, + "language_loss": 1.03936744, + "learning_rate": 0.0009998523811171828, + "loss": 1.0527463, + "num_input_tokens_seen": 14967264, + "router_z_loss_mlp": 0.14379883, + "step": 195, + "time_per_iteration": 2.501542568206787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01296215, + "balance_loss_mlp": 1.28125429, + "epoch": 0.03770681031165833, + "flos": 511372459008.0, + "grad_norm": 0.09414845611868274, + "language_loss": 1.04584992, + "learning_rate": 0.0009998447142611248, + "loss": 1.05881214, + "num_input_tokens_seen": 15039104, + "router_z_loss_mlp": 0.14941406, + "step": 196, + "time_per_iteration": 2.6247317790985107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128144, + "balance_loss_mlp": 1.26702762, + "epoch": 0.03789919199692189, + "flos": 807102061056.0, + "grad_norm": 0.05831249070889761, + "language_loss": 0.97701526, + "learning_rate": 0.0009998368533481387, + "loss": 0.9898296, + "num_input_tokens_seen": 15124864, + "router_z_loss_mlp": 0.14422607, + "step": 197, + "time_per_iteration": 3.01912784576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294999, + "balance_loss_mlp": 1.27945375, + "epoch": 0.03809157368218546, + "flos": 690274234368.0, + "grad_norm": 0.06656848410147823, + "language_loss": 1.00630498, + "learning_rate": 0.0009998287983812762, + "loss": 1.01925504, + "num_input_tokens_seen": 15199680, + "router_z_loss_mlp": 0.15551758, + "step": 198, + "time_per_iteration": 2.8252804279327393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0135387, + "balance_loss_mlp": 1.33592904, + "epoch": 0.03828395536744902, + "flos": 517675428864.0, + "grad_norm": 0.06988401379713739, + "language_loss": 1.06386423, + "learning_rate": 0.0009998205493636646, + "loss": 1.07740283, + "num_input_tokens_seen": 15270176, + "router_z_loss_mlp": 0.17944336, + "step": 199, + "time_per_iteration": 2.649765729904175 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01339461, + "balance_loss_mlp": 1.32242572, + "epoch": 0.038476337052712584, + "flos": 581389138944.0, + "grad_norm": 0.07184113921580974, + "language_loss": 0.9925406, + "learning_rate": 0.0009998121062985063, + "loss": 1.00593519, + "num_input_tokens_seen": 15343168, + "router_z_loss_mlp": 0.17053223, + "step": 200, + "time_per_iteration": 2.6788320541381836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01328142, + "balance_loss_mlp": 1.31167912, + "epoch": 0.03866871873797614, + "flos": 576791359488.0, + "grad_norm": 0.059667024197710104, + "language_loss": 1.01260698, + "learning_rate": 0.0009998034691890794, + "loss": 1.02588844, + "num_input_tokens_seen": 15417328, + "router_z_loss_mlp": 0.16455078, + "step": 201, + "time_per_iteration": 2.753265380859375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01297644, + "balance_loss_mlp": 1.28249288, + "epoch": 0.03886110042323971, + "flos": 540472117248.0, + "grad_norm": 0.07302515973387386, + "language_loss": 1.05948424, + "learning_rate": 0.0009997946380387369, + "loss": 1.07246065, + "num_input_tokens_seen": 15489488, + "router_z_loss_mlp": 0.15136719, + "step": 202, + "time_per_iteration": 2.618546485900879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01262023, + "balance_loss_mlp": 1.24746776, + "epoch": 0.03905348210850327, + "flos": 717694843392.0, + "grad_norm": 0.0775452329378228, + "language_loss": 1.08266401, + "learning_rate": 0.0009997856128509076, + "loss": 1.09528422, + "num_input_tokens_seen": 15558944, + "router_z_loss_mlp": 0.14550781, + "step": 203, + "time_per_iteration": 2.8284859657287598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267878, + "balance_loss_mlp": 1.25321579, + "epoch": 0.039245863793766836, + "flos": 427268639232.0, + "grad_norm": 0.06664318613050589, + "language_loss": 1.02886617, + "learning_rate": 0.0009997763936290952, + "loss": 1.04154491, + "num_input_tokens_seen": 15625024, + "router_z_loss_mlp": 0.14660645, + "step": 204, + "time_per_iteration": 2.516263246536255 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0129892, + "balance_loss_mlp": 1.28264785, + "epoch": 0.039438245479030395, + "flos": 662804163072.0, + "grad_norm": 0.07463685050771204, + "language_loss": 1.0815413, + "learning_rate": 0.0009997669803768789, + "loss": 1.09453046, + "num_input_tokens_seen": 15697120, + "router_z_loss_mlp": 0.16271973, + "step": 205, + "time_per_iteration": 2.7576606273651123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01291456, + "balance_loss_mlp": 1.27812803, + "epoch": 0.03963062716429396, + "flos": 635063459328.0, + "grad_norm": 0.055878982250893716, + "language_loss": 1.03253651, + "learning_rate": 0.0009997573730979134, + "loss": 1.04545116, + "num_input_tokens_seen": 15768752, + "router_z_loss_mlp": 0.13342285, + "step": 206, + "time_per_iteration": 2.716325521469116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.07279634, + "balance_loss_mlp": 4.65512276, + "epoch": 0.03982300884955752, + "flos": 1417813286400.0, + "grad_norm": 0.533603848118922, + "language_loss": 0.79193199, + "learning_rate": 0.0009997475717959284, + "loss": 0.86472833, + "num_input_tokens_seen": 15980624, + "router_z_loss_mlp": 26.25, + "step": 207, + "time_per_iteration": 4.635821342468262 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0137482, + "balance_loss_mlp": 1.35964513, + "epoch": 0.04001539053482109, + "flos": 688769713152.0, + "grad_norm": 0.1040721574676452, + "language_loss": 1.02094078, + "learning_rate": 0.0009997375764747294, + "loss": 1.03468895, + "num_input_tokens_seen": 16067232, + "router_z_loss_mlp": 0.1517334, + "step": 208, + "time_per_iteration": 2.974442481994629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01415285, + "balance_loss_mlp": 1.40052748, + "epoch": 0.04020777222008465, + "flos": 533363042304.0, + "grad_norm": 0.08111266742266361, + "language_loss": 0.99458027, + "learning_rate": 0.0009997273871381967, + "loss": 1.00873303, + "num_input_tokens_seen": 16139808, + "router_z_loss_mlp": 0.14758301, + "step": 209, + "time_per_iteration": 2.6802144050598145 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01466201, + "balance_loss_mlp": 1.44989347, + "epoch": 0.040400153905348214, + "flos": 567661381632.0, + "grad_norm": 0.06875741115436663, + "language_loss": 1.05031717, + "learning_rate": 0.0009997170037902862, + "loss": 1.0649792, + "num_input_tokens_seen": 16210848, + "router_z_loss_mlp": 0.16308594, + "step": 210, + "time_per_iteration": 2.6975836753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531768, + "balance_loss_mlp": 1.51399446, + "epoch": 0.040592535590611774, + "flos": 713130559488.0, + "grad_norm": 0.07197690318934227, + "language_loss": 1.07202697, + "learning_rate": 0.0009997064264350292, + "loss": 1.08734465, + "num_input_tokens_seen": 16283984, + "router_z_loss_mlp": 0.17785645, + "step": 211, + "time_per_iteration": 2.836771011352539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01531925, + "balance_loss_mlp": 1.5154984, + "epoch": 0.04078491727587533, + "flos": 577824427008.0, + "grad_norm": 0.09120436996840299, + "language_loss": 1.0146966, + "learning_rate": 0.0009996956550765317, + "loss": 1.03001595, + "num_input_tokens_seen": 16353904, + "router_z_loss_mlp": 0.16430664, + "step": 212, + "time_per_iteration": 2.671292781829834 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01499293, + "balance_loss_mlp": 1.4817214, + "epoch": 0.0409772989611389, + "flos": 552033404928.0, + "grad_norm": 0.11449485477945152, + "language_loss": 0.96278083, + "learning_rate": 0.0009996846897189762, + "loss": 0.97777379, + "num_input_tokens_seen": 16425488, + "router_z_loss_mlp": 0.17565918, + "step": 213, + "time_per_iteration": 2.6231424808502197 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.014653, + "balance_loss_mlp": 1.44753814, + "epoch": 0.04116968064640246, + "flos": 555347833344.0, + "grad_norm": 0.09512793115916172, + "language_loss": 1.02356708, + "learning_rate": 0.0009996735303666193, + "loss": 1.03822017, + "num_input_tokens_seen": 16498016, + "router_z_loss_mlp": 0.1776123, + "step": 214, + "time_per_iteration": 2.6930177211761475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01477298, + "balance_loss_mlp": 1.46134758, + "epoch": 0.041362062331666026, + "flos": 578204158464.0, + "grad_norm": 0.09141123477091552, + "language_loss": 1.04750729, + "learning_rate": 0.0009996621770237937, + "loss": 1.06228042, + "num_input_tokens_seen": 16573744, + "router_z_loss_mlp": 0.15942383, + "step": 215, + "time_per_iteration": 2.7448923587799072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01578462, + "balance_loss_mlp": 1.56013966, + "epoch": 0.041554444016929586, + "flos": 611130396672.0, + "grad_norm": 0.10233552268903827, + "language_loss": 0.99822551, + "learning_rate": 0.0009996506296949073, + "loss": 1.01401007, + "num_input_tokens_seen": 16655344, + "router_z_loss_mlp": 0.18334961, + "step": 216, + "time_per_iteration": 2.8548526763916016 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01609008, + "balance_loss_mlp": 1.59156775, + "epoch": 0.04174682570219315, + "flos": 527857413120.0, + "grad_norm": 0.10522858499680945, + "language_loss": 0.99888742, + "learning_rate": 0.0009996388883844428, + "loss": 1.01497757, + "num_input_tokens_seen": 16726480, + "router_z_loss_mlp": 0.17456055, + "step": 217, + "time_per_iteration": 2.618546724319458 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01557164, + "balance_loss_mlp": 1.54124999, + "epoch": 0.04193920738745671, + "flos": 511258977792.0, + "grad_norm": 0.09341741551851517, + "language_loss": 1.03841758, + "learning_rate": 0.0009996269530969588, + "loss": 1.05398929, + "num_input_tokens_seen": 16792112, + "router_z_loss_mlp": 0.15905762, + "step": 218, + "time_per_iteration": 2.6204636096954346 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01525903, + "balance_loss_mlp": 1.50927377, + "epoch": 0.04213158907272028, + "flos": 571226093568.0, + "grad_norm": 0.09609660813155754, + "language_loss": 1.02944803, + "learning_rate": 0.0009996148238370888, + "loss": 1.04470706, + "num_input_tokens_seen": 16862960, + "router_z_loss_mlp": 0.16625977, + "step": 219, + "time_per_iteration": 2.7071943283081055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0150071, + "balance_loss_mlp": 1.48340106, + "epoch": 0.04232397075798384, + "flos": 963803667456.0, + "grad_norm": 0.05454565212769997, + "language_loss": 0.9941752, + "learning_rate": 0.0009996025006095421, + "loss": 1.00918233, + "num_input_tokens_seen": 16950416, + "router_z_loss_mlp": 0.1730957, + "step": 220, + "time_per_iteration": 3.3006374835968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.10944285, + "balance_loss_mlp": 6.84272289, + "epoch": 0.042516352443247404, + "flos": 1468814777856.0, + "grad_norm": 0.48497418398004566, + "language_loss": 0.77783144, + "learning_rate": 0.0009995899834191028, + "loss": 0.88727427, + "num_input_tokens_seen": 17180944, + "router_z_loss_mlp": 41.0, + "step": 221, + "time_per_iteration": 5.7136383056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601015, + "balance_loss_mlp": 1.58291924, + "epoch": 0.042708734128510964, + "flos": 654419091456.0, + "grad_norm": 0.10763646442297387, + "language_loss": 0.99765503, + "learning_rate": 0.0009995772722706307, + "loss": 1.0136652, + "num_input_tokens_seen": 17257792, + "router_z_loss_mlp": 0.1809082, + "step": 222, + "time_per_iteration": 2.8322792053222656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01658843, + "balance_loss_mlp": 1.63811278, + "epoch": 0.04290111581377453, + "flos": 431601578496.0, + "grad_norm": 0.16393394652444138, + "language_loss": 1.13557565, + "learning_rate": 0.0009995643671690604, + "loss": 1.1521641, + "num_input_tokens_seen": 17320288, + "router_z_loss_mlp": 0.20739746, + "step": 223, + "time_per_iteration": 2.470729351043701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163871, + "balance_loss_mlp": 1.61504686, + "epoch": 0.04309349749903809, + "flos": 644379701760.0, + "grad_norm": 0.08733094203359489, + "language_loss": 1.00837708, + "learning_rate": 0.0009995512681194023, + "loss": 1.02476418, + "num_input_tokens_seen": 17396672, + "router_z_loss_mlp": 0.23632812, + "step": 224, + "time_per_iteration": 2.8274452686309814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615568, + "balance_loss_mlp": 1.58755326, + "epoch": 0.04328587918430166, + "flos": 830861853696.0, + "grad_norm": 0.12001676841435771, + "language_loss": 0.98664522, + "learning_rate": 0.0009995379751267417, + "loss": 1.00280082, + "num_input_tokens_seen": 17488096, + "router_z_loss_mlp": 0.28027344, + "step": 225, + "time_per_iteration": 3.275660991668701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01617416, + "balance_loss_mlp": 1.58639741, + "epoch": 0.043478260869565216, + "flos": 524804852736.0, + "grad_norm": 0.1467276253632499, + "language_loss": 1.0007726, + "learning_rate": 0.0009995244881962398, + "loss": 1.01694679, + "num_input_tokens_seen": 17557632, + "router_z_loss_mlp": 0.30981445, + "step": 226, + "time_per_iteration": 2.6300203800201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01601732, + "balance_loss_mlp": 1.56787658, + "epoch": 0.04367064255482878, + "flos": 439253328384.0, + "grad_norm": 0.095918638324787, + "language_loss": 1.01389623, + "learning_rate": 0.0009995108073331323, + "loss": 1.02991343, + "num_input_tokens_seen": 17626672, + "router_z_loss_mlp": 0.33862305, + "step": 227, + "time_per_iteration": 2.667628765106201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0158134, + "balance_loss_mlp": 1.5462923, + "epoch": 0.04386302424009234, + "flos": 507109330944.0, + "grad_norm": 0.08564981186298011, + "language_loss": 1.04024279, + "learning_rate": 0.0009994969325427309, + "loss": 1.05605614, + "num_input_tokens_seen": 17698624, + "router_z_loss_mlp": 0.35058594, + "step": 228, + "time_per_iteration": 2.6454501152038574 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01558795, + "balance_loss_mlp": 1.52224541, + "epoch": 0.04405540592535591, + "flos": 540432829440.0, + "grad_norm": 0.07744391701114339, + "language_loss": 1.00052619, + "learning_rate": 0.0009994828638304218, + "loss": 1.016114, + "num_input_tokens_seen": 17767760, + "router_z_loss_mlp": 0.36547852, + "step": 229, + "time_per_iteration": 2.6468071937561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01516137, + "balance_loss_mlp": 1.47794271, + "epoch": 0.04424778761061947, + "flos": 446136850944.0, + "grad_norm": 0.08052263902742013, + "language_loss": 1.06763554, + "learning_rate": 0.0009994686012016675, + "loss": 1.08279693, + "num_input_tokens_seen": 17833664, + "router_z_loss_mlp": 0.3815918, + "step": 230, + "time_per_iteration": 2.5467634201049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01483037, + "balance_loss_mlp": 1.44515228, + "epoch": 0.044440169295883035, + "flos": 700383435264.0, + "grad_norm": 0.05918307184238542, + "language_loss": 1.0518043, + "learning_rate": 0.000999454144662005, + "loss": 1.06663465, + "num_input_tokens_seen": 17908880, + "router_z_loss_mlp": 0.37866211, + "step": 231, + "time_per_iteration": 2.8704099655151367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01473358, + "balance_loss_mlp": 1.43549716, + "epoch": 0.044632550981146595, + "flos": 588055873536.0, + "grad_norm": 0.08626514264815018, + "language_loss": 0.99676436, + "learning_rate": 0.0009994394942170468, + "loss": 1.01149797, + "num_input_tokens_seen": 17978208, + "router_z_loss_mlp": 0.37866211, + "step": 232, + "time_per_iteration": 2.6578898429870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01461415, + "balance_loss_mlp": 1.4258194, + "epoch": 0.04482493266641016, + "flos": 554534525952.0, + "grad_norm": 0.07124765242066121, + "language_loss": 0.96965969, + "learning_rate": 0.0009994246498724808, + "loss": 0.98427379, + "num_input_tokens_seen": 18049296, + "router_z_loss_mlp": 0.35620117, + "step": 233, + "time_per_iteration": 2.7764015197753906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01463645, + "balance_loss_mlp": 1.42790616, + "epoch": 0.04501731435167372, + "flos": 722500646400.0, + "grad_norm": 0.07759597622956232, + "language_loss": 0.99069166, + "learning_rate": 0.00099940961163407, + "loss": 1.00532806, + "num_input_tokens_seen": 18123296, + "router_z_loss_mlp": 0.35766602, + "step": 234, + "time_per_iteration": 2.8431143760681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01454599, + "balance_loss_mlp": 1.42098188, + "epoch": 0.04520969603693728, + "flos": 511539784704.0, + "grad_norm": 0.05931413709293958, + "language_loss": 1.02564597, + "learning_rate": 0.0009993943795076528, + "loss": 1.04019189, + "num_input_tokens_seen": 18192784, + "router_z_loss_mlp": 0.33642578, + "step": 235, + "time_per_iteration": 2.645988702774048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01444244, + "balance_loss_mlp": 1.40936303, + "epoch": 0.04540207772220085, + "flos": 364854246912.0, + "grad_norm": 0.07280953320994132, + "language_loss": 1.04776168, + "learning_rate": 0.0009993789534991427, + "loss": 1.062204, + "num_input_tokens_seen": 18254064, + "router_z_loss_mlp": 0.34912109, + "step": 236, + "time_per_iteration": 2.4084837436676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01418385, + "balance_loss_mlp": 1.38390946, + "epoch": 0.045594459407464406, + "flos": 522407038464.0, + "grad_norm": 0.060943880380569936, + "language_loss": 0.99500269, + "learning_rate": 0.0009993633336145287, + "loss": 1.00918651, + "num_input_tokens_seen": 18325728, + "router_z_loss_mlp": 0.34472656, + "step": 237, + "time_per_iteration": 2.6044533252716064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01406135, + "balance_loss_mlp": 1.3730185, + "epoch": 0.04578684109272797, + "flos": 671442338304.0, + "grad_norm": 0.06747057459653658, + "language_loss": 1.03573179, + "learning_rate": 0.0009993475198598752, + "loss": 1.04979324, + "num_input_tokens_seen": 18408608, + "router_z_loss_mlp": 0.33129883, + "step": 238, + "time_per_iteration": 2.9948084354400635 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01383668, + "balance_loss_mlp": 1.35164809, + "epoch": 0.04597922277799153, + "flos": 541387321344.0, + "grad_norm": 0.07135856148897902, + "language_loss": 0.99909985, + "learning_rate": 0.0009993315122413212, + "loss": 1.01293659, + "num_input_tokens_seen": 18471920, + "router_z_loss_mlp": 0.32006836, + "step": 239, + "time_per_iteration": 2.5848827362060547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369111, + "balance_loss_mlp": 1.33773541, + "epoch": 0.0461716044632551, + "flos": 458732616192.0, + "grad_norm": 0.056000088810755834, + "language_loss": 1.0008105, + "learning_rate": 0.0009993153107650818, + "loss": 1.01450157, + "num_input_tokens_seen": 18540496, + "router_z_loss_mlp": 0.31347656, + "step": 240, + "time_per_iteration": 2.5492687225341797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338815, + "balance_loss_mlp": 1.31015706, + "epoch": 0.04636398614851866, + "flos": 455009342976.0, + "grad_norm": 0.06491754001609312, + "language_loss": 0.99534512, + "learning_rate": 0.0009992989154374468, + "loss": 1.00873327, + "num_input_tokens_seen": 18606944, + "router_z_loss_mlp": 0.28662109, + "step": 241, + "time_per_iteration": 2.511237621307373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01294622, + "balance_loss_mlp": 1.26833653, + "epoch": 0.046556367833782225, + "flos": 556558401024.0, + "grad_norm": 0.07592069792168304, + "language_loss": 1.06626534, + "learning_rate": 0.0009992823262647817, + "loss": 1.07921147, + "num_input_tokens_seen": 18679520, + "router_z_loss_mlp": 0.26293945, + "step": 242, + "time_per_iteration": 2.7618701457977295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01249282, + "balance_loss_mlp": 1.22240043, + "epoch": 0.046748749519045785, + "flos": 592625949696.0, + "grad_norm": 0.0687662987323222, + "language_loss": 1.00893593, + "learning_rate": 0.0009992655432535264, + "loss": 1.0214287, + "num_input_tokens_seen": 18756656, + "router_z_loss_mlp": 0.26879883, + "step": 243, + "time_per_iteration": 2.7471935749053955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01255015, + "balance_loss_mlp": 1.23083937, + "epoch": 0.04694113120430935, + "flos": 569596506624.0, + "grad_norm": 0.07373455055845594, + "language_loss": 1.0054853, + "learning_rate": 0.0009992485664101973, + "loss": 1.01803541, + "num_input_tokens_seen": 18829792, + "router_z_loss_mlp": 0.24169922, + "step": 244, + "time_per_iteration": 2.635344982147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01295554, + "balance_loss_mlp": 1.27291572, + "epoch": 0.04713351288957291, + "flos": 863401158144.0, + "grad_norm": 0.10584905626659928, + "language_loss": 1.03312445, + "learning_rate": 0.000999231395741385, + "loss": 1.04607987, + "num_input_tokens_seen": 18906864, + "router_z_loss_mlp": 0.22631836, + "step": 245, + "time_per_iteration": 3.093386173248291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0128706, + "balance_loss_mlp": 1.26464868, + "epoch": 0.04732589457483648, + "flos": 536961249792.0, + "grad_norm": 0.08844420521863233, + "language_loss": 1.01371169, + "learning_rate": 0.0009992140312537557, + "loss": 1.02658224, + "num_input_tokens_seen": 18973632, + "router_z_loss_mlp": 0.22412109, + "step": 246, + "time_per_iteration": 2.667579412460327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256359, + "balance_loss_mlp": 1.23515141, + "epoch": 0.04751827626010004, + "flos": 761566910976.0, + "grad_norm": 0.052835972446563725, + "language_loss": 0.9609164, + "learning_rate": 0.000999196472954051, + "loss": 0.97347999, + "num_input_tokens_seen": 19052944, + "router_z_loss_mlp": 0.2121582, + "step": 247, + "time_per_iteration": 2.9537084102630615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02369813, + "balance_loss_mlp": 2.16687083, + "epoch": 0.0477106579453636, + "flos": 1578961313280.0, + "grad_norm": 0.2151482568863758, + "language_loss": 0.79424852, + "learning_rate": 0.0009991787208490878, + "loss": 0.81794667, + "num_input_tokens_seen": 19286288, + "router_z_loss_mlp": 2.03125, + "step": 248, + "time_per_iteration": 5.758621454238892 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01289622, + "balance_loss_mlp": 1.27137113, + "epoch": 0.04790303963062716, + "flos": 457535195136.0, + "grad_norm": 0.10849969336884063, + "language_loss": 1.03316629, + "learning_rate": 0.0009991607749457578, + "loss": 1.04606247, + "num_input_tokens_seen": 19349296, + "router_z_loss_mlp": 0.18261719, + "step": 249, + "time_per_iteration": 2.5432913303375244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01334566, + "balance_loss_mlp": 1.31724536, + "epoch": 0.04809542131589073, + "flos": 782079266304.0, + "grad_norm": 0.08264534697846654, + "language_loss": 1.01180637, + "learning_rate": 0.0009991426352510286, + "loss": 1.02515209, + "num_input_tokens_seen": 19428416, + "router_z_loss_mlp": 0.17321777, + "step": 250, + "time_per_iteration": 3.1542766094207764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01351096, + "balance_loss_mlp": 1.33215368, + "epoch": 0.04828780300115429, + "flos": 558995503104.0, + "grad_norm": 0.06435857362074206, + "language_loss": 1.03307557, + "learning_rate": 0.0009991243017719422, + "loss": 1.04658651, + "num_input_tokens_seen": 19498688, + "router_z_loss_mlp": 0.18933105, + "step": 251, + "time_per_iteration": 2.693882942199707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01333217, + "balance_loss_mlp": 1.31485844, + "epoch": 0.048480184686417856, + "flos": 501682277376.0, + "grad_norm": 0.09276508096019526, + "language_loss": 0.97794825, + "learning_rate": 0.0009991057745156165, + "loss": 0.99128038, + "num_input_tokens_seen": 19567568, + "router_z_loss_mlp": 0.18347168, + "step": 252, + "time_per_iteration": 2.628873109817505 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01867297, + "balance_loss_mlp": 1.75514495, + "epoch": 0.048672566371681415, + "flos": 1535585430528.0, + "grad_norm": 0.16359674361847032, + "language_loss": 0.81910986, + "learning_rate": 0.0009990870534892446, + "loss": 0.8377828, + "num_input_tokens_seen": 19796368, + "router_z_loss_mlp": 1.125, + "step": 253, + "time_per_iteration": 5.060615062713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01285031, + "balance_loss_mlp": 1.26567185, + "epoch": 0.04886494805694498, + "flos": 537665458176.0, + "grad_norm": 0.07164286827098729, + "language_loss": 1.06546187, + "learning_rate": 0.0009990681387000943, + "loss": 1.07831216, + "num_input_tokens_seen": 19870480, + "router_z_loss_mlp": 0.19384766, + "step": 254, + "time_per_iteration": 2.783367395401001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01275754, + "balance_loss_mlp": 1.25606036, + "epoch": 0.04905732974220854, + "flos": 679841966592.0, + "grad_norm": 0.06618046133348403, + "language_loss": 1.01404011, + "learning_rate": 0.0009990490301555093, + "loss": 1.02679765, + "num_input_tokens_seen": 19956288, + "router_z_loss_mlp": 0.19689941, + "step": 255, + "time_per_iteration": 2.9520761966705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01936632, + "balance_loss_mlp": 1.86796737, + "epoch": 0.04924971142747211, + "flos": 1420408949760.0, + "grad_norm": 0.31562964738653715, + "language_loss": 0.79215157, + "learning_rate": 0.0009990297278629078, + "loss": 0.81151783, + "num_input_tokens_seen": 20180080, + "router_z_loss_mlp": 0.6875, + "step": 256, + "time_per_iteration": 4.825209856033325 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01615246, + "balance_loss_mlp": 1.55344784, + "epoch": 0.04944209311273567, + "flos": 1557202074624.0, + "grad_norm": 0.16937574338078817, + "language_loss": 0.79242742, + "learning_rate": 0.000999010231829784, + "loss": 0.80857986, + "num_input_tokens_seen": 20413456, + "router_z_loss_mlp": 0.6171875, + "step": 257, + "time_per_iteration": 4.995501518249512 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0163115, + "balance_loss_mlp": 1.58422887, + "epoch": 0.04963447479799923, + "flos": 1569985514496.0, + "grad_norm": 0.13524925240989144, + "language_loss": 0.69975883, + "learning_rate": 0.0009989905420637066, + "loss": 0.71607035, + "num_input_tokens_seen": 20644736, + "router_z_loss_mlp": 0.46875, + "step": 258, + "time_per_iteration": 4.841471910476685 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01272668, + "balance_loss_mlp": 1.24463022, + "epoch": 0.049826856483262794, + "flos": 625063357440.0, + "grad_norm": 0.06365504505971183, + "language_loss": 0.95603192, + "learning_rate": 0.0009989706585723202, + "loss": 0.96875864, + "num_input_tokens_seen": 20719040, + "router_z_loss_mlp": 0.28076172, + "step": 259, + "time_per_iteration": 2.7635786533355713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0130022, + "balance_loss_mlp": 1.27020288, + "epoch": 0.05001923816852635, + "flos": 503912765952.0, + "grad_norm": 0.062257698278494894, + "language_loss": 1.01846027, + "learning_rate": 0.0009989505813633442, + "loss": 1.03146255, + "num_input_tokens_seen": 20789376, + "router_z_loss_mlp": 0.29980469, + "step": 260, + "time_per_iteration": 2.6451833248138428 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131611, + "balance_loss_mlp": 1.28101516, + "epoch": 0.05021161985378992, + "flos": 587066476032.0, + "grad_norm": 0.06290514068599455, + "language_loss": 1.01911807, + "learning_rate": 0.000998930310444573, + "loss": 1.03227913, + "num_input_tokens_seen": 20857856, + "router_z_loss_mlp": 0.35083008, + "step": 261, + "time_per_iteration": 2.6989662647247314 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01324978, + "balance_loss_mlp": 1.2880708, + "epoch": 0.05040400153905348, + "flos": 633029409792.0, + "grad_norm": 0.0625839964239575, + "language_loss": 1.00387836, + "learning_rate": 0.0009989098458238765, + "loss": 1.01712811, + "num_input_tokens_seen": 20931232, + "router_z_loss_mlp": 0.36914062, + "step": 262, + "time_per_iteration": 2.7581043243408203 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01319841, + "balance_loss_mlp": 1.28395867, + "epoch": 0.050596383224317046, + "flos": 553344307200.0, + "grad_norm": 0.06067150197267865, + "language_loss": 0.99905968, + "learning_rate": 0.0009988891875091998, + "loss": 1.01225805, + "num_input_tokens_seen": 21012672, + "router_z_loss_mlp": 0.35913086, + "step": 263, + "time_per_iteration": 2.7601842880249023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0131413, + "balance_loss_mlp": 1.27793837, + "epoch": 0.050788764909580605, + "flos": 549389689344.0, + "grad_norm": 0.07440292928735547, + "language_loss": 0.94277728, + "learning_rate": 0.0009988683355085636, + "loss": 0.95591855, + "num_input_tokens_seen": 21088592, + "router_z_loss_mlp": 0.36206055, + "step": 264, + "time_per_iteration": 2.7262909412384033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01277315, + "balance_loss_mlp": 1.24248254, + "epoch": 0.05098114659484417, + "flos": 604812870144.0, + "grad_norm": 0.06984595792035174, + "language_loss": 1.02861905, + "learning_rate": 0.000998847289830063, + "loss": 1.04139221, + "num_input_tokens_seen": 21169840, + "router_z_loss_mlp": 0.34838867, + "step": 265, + "time_per_iteration": 2.8318397998809814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01256298, + "balance_loss_mlp": 1.22272849, + "epoch": 0.05117352828010773, + "flos": 438317775360.0, + "grad_norm": 0.08677906198544101, + "language_loss": 0.95779377, + "learning_rate": 0.0009988260504818682, + "loss": 0.9703567, + "num_input_tokens_seen": 21236144, + "router_z_loss_mlp": 0.3359375, + "step": 266, + "time_per_iteration": 2.5212388038635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01220367, + "balance_loss_mlp": 1.19046903, + "epoch": 0.0513659099653713, + "flos": 504784300032.0, + "grad_norm": 0.09456939977029206, + "language_loss": 1.01958096, + "learning_rate": 0.000998804617472226, + "loss": 1.03178465, + "num_input_tokens_seen": 21304864, + "router_z_loss_mlp": 0.29858398, + "step": 267, + "time_per_iteration": 2.649739980697632 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01169139, + "balance_loss_mlp": 1.14131606, + "epoch": 0.05155829165063486, + "flos": 695183344128.0, + "grad_norm": 0.07125411147685125, + "language_loss": 0.97574937, + "learning_rate": 0.0009987829908094568, + "loss": 0.98744082, + "num_input_tokens_seen": 21377504, + "router_z_loss_mlp": 0.27856445, + "step": 268, + "time_per_iteration": 2.816098690032959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119703, + "balance_loss_mlp": 1.09379935, + "epoch": 0.051750673335898424, + "flos": 1347751830528.0, + "grad_norm": 0.06583247177587333, + "language_loss": 1.04151332, + "learning_rate": 0.0009987611705019569, + "loss": 1.05271029, + "num_input_tokens_seen": 21463840, + "router_z_loss_mlp": 0.25927734, + "step": 269, + "time_per_iteration": 4.478148460388184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103433, + "balance_loss_mlp": 1.08008027, + "epoch": 0.051943055021161984, + "flos": 489362936832.0, + "grad_norm": 0.06787757239342199, + "language_loss": 1.02481639, + "learning_rate": 0.0009987391565581978, + "loss": 1.03585076, + "num_input_tokens_seen": 21531184, + "router_z_loss_mlp": 0.23364258, + "step": 270, + "time_per_iteration": 2.5662009716033936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111004, + "balance_loss_mlp": 1.08859241, + "epoch": 0.05213543670642555, + "flos": 545504882688.0, + "grad_norm": 0.08198896814085149, + "language_loss": 0.9504528, + "learning_rate": 0.000998716948986726, + "loss": 0.96156287, + "num_input_tokens_seen": 21612224, + "router_z_loss_mlp": 0.22424316, + "step": 271, + "time_per_iteration": 2.7815349102020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158552, + "balance_loss_mlp": 1.13697529, + "epoch": 0.05232781839168911, + "flos": 603285179904.0, + "grad_norm": 0.07646156534985457, + "language_loss": 0.97641921, + "learning_rate": 0.0009986945477961633, + "loss": 0.9880048, + "num_input_tokens_seen": 21681024, + "router_z_loss_mlp": 0.21569824, + "step": 272, + "time_per_iteration": 2.694547414779663 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188724, + "balance_loss_mlp": 1.16735017, + "epoch": 0.052520200076952676, + "flos": 538218307584.0, + "grad_norm": 0.07381807258867126, + "language_loss": 1.02498066, + "learning_rate": 0.0009986719529952066, + "loss": 1.03686786, + "num_input_tokens_seen": 21761616, + "router_z_loss_mlp": 0.21386719, + "step": 273, + "time_per_iteration": 2.8339192867279053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0121752, + "balance_loss_mlp": 1.19785035, + "epoch": 0.052712581762216236, + "flos": 463148513280.0, + "grad_norm": 0.0738352941440963, + "language_loss": 1.01808548, + "learning_rate": 0.000998649164592628, + "loss": 1.03026068, + "num_input_tokens_seen": 21828416, + "router_z_loss_mlp": 0.19677734, + "step": 274, + "time_per_iteration": 2.60577130317688 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236713, + "balance_loss_mlp": 1.21763909, + "epoch": 0.0529049634474798, + "flos": 547749927936.0, + "grad_norm": 0.08134169766286939, + "language_loss": 0.99272913, + "learning_rate": 0.0009986261825972748, + "loss": 1.00509632, + "num_input_tokens_seen": 21901600, + "router_z_loss_mlp": 0.19055176, + "step": 275, + "time_per_iteration": 2.652561664581299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196689, + "balance_loss_mlp": 1.17834246, + "epoch": 0.05309734513274336, + "flos": 617727320064.0, + "grad_norm": 0.09111845604121613, + "language_loss": 1.01860571, + "learning_rate": 0.000998603007018069, + "loss": 1.03057253, + "num_input_tokens_seen": 21979312, + "router_z_loss_mlp": 0.18334961, + "step": 276, + "time_per_iteration": 2.8293774127960205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011443, + "balance_loss_mlp": 1.1273365, + "epoch": 0.05328972681800693, + "flos": 605220304896.0, + "grad_norm": 0.07377841396756965, + "language_loss": 0.99345076, + "learning_rate": 0.0009985796378640089, + "loss": 1.00489378, + "num_input_tokens_seen": 22053776, + "router_z_loss_mlp": 0.16955566, + "step": 277, + "time_per_iteration": 2.694716215133667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098185, + "balance_loss_mlp": 1.08067346, + "epoch": 0.05348210850327049, + "flos": 604197411840.0, + "grad_norm": 0.07074934963985437, + "language_loss": 0.99532163, + "learning_rate": 0.0009985560751441665, + "loss": 1.00630355, + "num_input_tokens_seen": 22134304, + "router_z_loss_mlp": 0.1751709, + "step": 278, + "time_per_iteration": 2.798563241958618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095446, + "balance_loss_mlp": 1.07736206, + "epoch": 0.053674490188534055, + "flos": 630480236544.0, + "grad_norm": 0.054749659326078955, + "language_loss": 1.01733184, + "learning_rate": 0.00099853231886769, + "loss": 1.02828622, + "num_input_tokens_seen": 22212896, + "router_z_loss_mlp": 0.1809082, + "step": 279, + "time_per_iteration": 2.780940532684326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134885, + "balance_loss_mlp": 1.11744475, + "epoch": 0.053866871873797614, + "flos": 478939433472.0, + "grad_norm": 0.06375435082524677, + "language_loss": 1.01461124, + "learning_rate": 0.0009985083690438024, + "loss": 1.02595997, + "num_input_tokens_seen": 22287216, + "router_z_loss_mlp": 0.17443848, + "step": 280, + "time_per_iteration": 2.68762469291687 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145965, + "balance_loss_mlp": 1.12913251, + "epoch": 0.054059253559061174, + "flos": 787673645568.0, + "grad_norm": 0.07384801764192533, + "language_loss": 0.92380941, + "learning_rate": 0.0009984842256818016, + "loss": 0.93526906, + "num_input_tokens_seen": 22370864, + "router_z_loss_mlp": 0.16845703, + "step": 281, + "time_per_iteration": 3.054032325744629 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114791, + "balance_loss_mlp": 1.13080359, + "epoch": 0.05425163524432474, + "flos": 628076630016.0, + "grad_norm": 0.082175996598207, + "language_loss": 1.0314945, + "learning_rate": 0.0009984598887910613, + "loss": 1.04297376, + "num_input_tokens_seen": 22440080, + "router_z_loss_mlp": 0.17114258, + "step": 282, + "time_per_iteration": 2.7095611095428467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144379, + "balance_loss_mlp": 1.12627149, + "epoch": 0.0544440169295883, + "flos": 615453161472.0, + "grad_norm": 0.06813866095032944, + "language_loss": 0.9902432, + "learning_rate": 0.0009984353583810297, + "loss": 1.00168693, + "num_input_tokens_seen": 22517936, + "router_z_loss_mlp": 0.18103027, + "step": 283, + "time_per_iteration": 2.804438829421997 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124262, + "balance_loss_mlp": 1.10624945, + "epoch": 0.05463639861485187, + "flos": 647471549952.0, + "grad_norm": 0.10003204141391345, + "language_loss": 1.01340103, + "learning_rate": 0.0009984106344612302, + "loss": 1.02464366, + "num_input_tokens_seen": 22590480, + "router_z_loss_mlp": 0.18017578, + "step": 284, + "time_per_iteration": 2.7521376609802246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109552, + "balance_loss_mlp": 1.07819879, + "epoch": 0.054828780300115426, + "flos": 796845883392.0, + "grad_norm": 0.07143310654982075, + "language_loss": 0.96421391, + "learning_rate": 0.0009983857170412615, + "loss": 0.97516906, + "num_input_tokens_seen": 22668144, + "router_z_loss_mlp": 0.17321777, + "step": 285, + "time_per_iteration": 2.9796621799468994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089942, + "balance_loss_mlp": 1.07363439, + "epoch": 0.05502116198537899, + "flos": 549414420480.0, + "grad_norm": 0.05224422052371224, + "language_loss": 0.95713383, + "learning_rate": 0.000998360606130798, + "loss": 0.96803325, + "num_input_tokens_seen": 22749648, + "router_z_loss_mlp": 0.16308594, + "step": 286, + "time_per_iteration": 2.7950801849365234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02908189, + "balance_loss_mlp": 2.83799911, + "epoch": 0.05521354367064255, + "flos": 1406967791616.0, + "grad_norm": 0.233188183772104, + "language_loss": 0.69073117, + "learning_rate": 0.0009983353017395877, + "loss": 0.71981305, + "num_input_tokens_seen": 22982752, + "router_z_loss_mlp": 0.703125, + "step": 287, + "time_per_iteration": 4.876653432846069 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179574, + "balance_loss_mlp": 1.1627655, + "epoch": 0.05540592535590612, + "flos": 645123197952.0, + "grad_norm": 0.17417830683261867, + "language_loss": 1.0204829, + "learning_rate": 0.0009983098038774552, + "loss": 1.03227878, + "num_input_tokens_seen": 23053584, + "router_z_loss_mlp": 0.16821289, + "step": 288, + "time_per_iteration": 2.7781550884246826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02101154, + "balance_loss_mlp": 2.07540464, + "epoch": 0.05559830704116968, + "flos": 1510293413376.0, + "grad_norm": 0.1730100464590254, + "language_loss": 0.78170228, + "learning_rate": 0.0009982841125542993, + "loss": 0.80271375, + "num_input_tokens_seen": 23280256, + "router_z_loss_mlp": 0.2578125, + "step": 289, + "time_per_iteration": 4.801970481872559 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01338926, + "balance_loss_mlp": 1.32332134, + "epoch": 0.055790688726433245, + "flos": 508078379520.0, + "grad_norm": 0.11288123874753296, + "language_loss": 0.99586821, + "learning_rate": 0.0009982582277800948, + "loss": 1.00925756, + "num_input_tokens_seen": 23345760, + "router_z_loss_mlp": 0.15588379, + "step": 290, + "time_per_iteration": 2.6019012928009033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01376714, + "balance_loss_mlp": 1.36076403, + "epoch": 0.055983070411696804, + "flos": 657570576384.0, + "grad_norm": 0.11158393407579077, + "language_loss": 1.06464982, + "learning_rate": 0.0009982321495648908, + "loss": 1.07841706, + "num_input_tokens_seen": 23420720, + "router_z_loss_mlp": 0.15942383, + "step": 291, + "time_per_iteration": 2.7833075523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01281101, + "balance_loss_mlp": 1.26441216, + "epoch": 0.05617545209696037, + "flos": 587051919360.0, + "grad_norm": 0.091490024999748, + "language_loss": 0.97375935, + "learning_rate": 0.0009982058779188115, + "loss": 0.98657036, + "num_input_tokens_seen": 23492576, + "router_z_loss_mlp": 0.16699219, + "step": 292, + "time_per_iteration": 2.700998067855835 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223751, + "balance_loss_mlp": 1.20634639, + "epoch": 0.05636783378222393, + "flos": 611331217920.0, + "grad_norm": 0.09093545163733599, + "language_loss": 1.05090272, + "learning_rate": 0.0009981794128520567, + "loss": 1.06314015, + "num_input_tokens_seen": 23569824, + "router_z_loss_mlp": 0.17431641, + "step": 293, + "time_per_iteration": 2.769562244415283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01172918, + "balance_loss_mlp": 1.15501237, + "epoch": 0.0565602154674875, + "flos": 667847102976.0, + "grad_norm": 0.08200667246549262, + "language_loss": 1.02219713, + "learning_rate": 0.000998152754374901, + "loss": 1.03392649, + "num_input_tokens_seen": 23649984, + "router_z_loss_mlp": 0.17919922, + "step": 294, + "time_per_iteration": 2.8421483039855957 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140496, + "balance_loss_mlp": 1.12121987, + "epoch": 0.05675259715275106, + "flos": 616963474944.0, + "grad_norm": 0.06298459153201627, + "language_loss": 0.97706711, + "learning_rate": 0.0009981259024976943, + "loss": 0.98847204, + "num_input_tokens_seen": 23722032, + "router_z_loss_mlp": 0.19250488, + "step": 295, + "time_per_iteration": 2.709536552429199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131247, + "balance_loss_mlp": 1.11139894, + "epoch": 0.05694497883801462, + "flos": 751424214528.0, + "grad_norm": 0.13011693222478776, + "language_loss": 0.96307456, + "learning_rate": 0.0009980988572308612, + "loss": 0.97438705, + "num_input_tokens_seen": 23797376, + "router_z_loss_mlp": 0.19848633, + "step": 296, + "time_per_iteration": 2.9606993198394775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125313, + "balance_loss_mlp": 1.10492802, + "epoch": 0.05713736052327818, + "flos": 711669708288.0, + "grad_norm": 0.06808560063607492, + "language_loss": 0.9959082, + "learning_rate": 0.0009980716185849015, + "loss": 1.00716126, + "num_input_tokens_seen": 23880496, + "router_z_loss_mlp": 0.20385742, + "step": 297, + "time_per_iteration": 2.952467203140259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133548, + "balance_loss_mlp": 1.11424804, + "epoch": 0.05732974220854175, + "flos": 468737100288.0, + "grad_norm": 0.05570922928007862, + "language_loss": 0.95103967, + "learning_rate": 0.0009980441865703904, + "loss": 0.9623751, + "num_input_tokens_seen": 23950016, + "router_z_loss_mlp": 0.19299316, + "step": 298, + "time_per_iteration": 2.6629996299743652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125947, + "balance_loss_mlp": 1.10630131, + "epoch": 0.05752212389380531, + "flos": 601143441408.0, + "grad_norm": 0.06175770353433084, + "language_loss": 1.038656, + "learning_rate": 0.000998016561197978, + "loss": 1.04991555, + "num_input_tokens_seen": 24020064, + "router_z_loss_mlp": 0.19628906, + "step": 299, + "time_per_iteration": 2.7027034759521484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122899, + "balance_loss_mlp": 1.10499382, + "epoch": 0.057714505579068875, + "flos": 678344799744.0, + "grad_norm": 0.07709513760197055, + "language_loss": 0.95715761, + "learning_rate": 0.0009979887424783895, + "loss": 0.96838653, + "num_input_tokens_seen": 24095360, + "router_z_loss_mlp": 0.17907715, + "step": 300, + "time_per_iteration": 2.8467562198638916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122592, + "balance_loss_mlp": 1.10369694, + "epoch": 0.057906887264332435, + "flos": 595604316672.0, + "grad_norm": 0.05754387138467597, + "language_loss": 0.94804943, + "learning_rate": 0.0009979607304224248, + "loss": 0.95927536, + "num_input_tokens_seen": 24164608, + "router_z_loss_mlp": 0.18908691, + "step": 301, + "time_per_iteration": 2.7457566261291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135958, + "balance_loss_mlp": 1.11577594, + "epoch": 0.058099268949596, + "flos": 551855904768.0, + "grad_norm": 0.06951393564289957, + "language_loss": 1.02452385, + "learning_rate": 0.000997932525040959, + "loss": 1.03588343, + "num_input_tokens_seen": 24233840, + "router_z_loss_mlp": 0.20166016, + "step": 302, + "time_per_iteration": 2.670464038848877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123814, + "balance_loss_mlp": 1.10513425, + "epoch": 0.05829165063485956, + "flos": 507906671616.0, + "grad_norm": 0.06408930588753382, + "language_loss": 1.04041958, + "learning_rate": 0.000997904126344943, + "loss": 1.05165768, + "num_input_tokens_seen": 24302928, + "router_z_loss_mlp": 0.18676758, + "step": 303, + "time_per_iteration": 2.654275417327881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122557, + "balance_loss_mlp": 1.10432982, + "epoch": 0.05848403232012313, + "flos": 614949774336.0, + "grad_norm": 0.10902949066110783, + "language_loss": 1.00108004, + "learning_rate": 0.0009978755343454018, + "loss": 1.0123055, + "num_input_tokens_seen": 24377024, + "router_z_loss_mlp": 0.18212891, + "step": 304, + "time_per_iteration": 2.7061922550201416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118016, + "balance_loss_mlp": 1.10034943, + "epoch": 0.05867641400538669, + "flos": 499835902464.0, + "grad_norm": 0.07196511907519268, + "language_loss": 1.01183403, + "learning_rate": 0.0009978467490534355, + "loss": 1.02301419, + "num_input_tokens_seen": 24442736, + "router_z_loss_mlp": 0.17663574, + "step": 305, + "time_per_iteration": 2.5658843517303467 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118418, + "balance_loss_mlp": 1.09971452, + "epoch": 0.05886879569065025, + "flos": 531019072512.0, + "grad_norm": 0.05577021807863236, + "language_loss": 0.98775607, + "learning_rate": 0.00099781777048022, + "loss": 0.99894023, + "num_input_tokens_seen": 24514800, + "router_z_loss_mlp": 0.18713379, + "step": 306, + "time_per_iteration": 2.688661813735962 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112614, + "balance_loss_mlp": 1.10866416, + "epoch": 0.05906117737591381, + "flos": 488811497472.0, + "grad_norm": 0.06489613907432343, + "language_loss": 0.99682212, + "learning_rate": 0.0009977885986370057, + "loss": 1.00808358, + "num_input_tokens_seen": 24581648, + "router_z_loss_mlp": 0.17480469, + "step": 307, + "time_per_iteration": 2.527008056640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01129188, + "balance_loss_mlp": 1.11242771, + "epoch": 0.05925355906117737, + "flos": 591213150720.0, + "grad_norm": 0.060579194597163814, + "language_loss": 0.94911426, + "learning_rate": 0.000997759233535118, + "loss": 0.96040612, + "num_input_tokens_seen": 24658864, + "router_z_loss_mlp": 0.16772461, + "step": 308, + "time_per_iteration": 2.768683433532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108276, + "balance_loss_mlp": 1.09052539, + "epoch": 0.05944594074644094, + "flos": 563373522432.0, + "grad_norm": 0.074144120767366, + "language_loss": 1.01706028, + "learning_rate": 0.0009977296751859576, + "loss": 1.02814317, + "num_input_tokens_seen": 24735808, + "router_z_loss_mlp": 0.17749023, + "step": 309, + "time_per_iteration": 2.710550308227539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109964, + "balance_loss_mlp": 1.0817585, + "epoch": 0.0596383224317045, + "flos": 538483147776.0, + "grad_norm": 0.1012520362466171, + "language_loss": 1.03562367, + "learning_rate": 0.0009976999236009998, + "loss": 1.04662001, + "num_input_tokens_seen": 24807744, + "router_z_loss_mlp": 0.17895508, + "step": 310, + "time_per_iteration": 2.7346065044403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095396, + "balance_loss_mlp": 1.07697809, + "epoch": 0.059830704116968066, + "flos": 560684726784.0, + "grad_norm": 0.05903807060939984, + "language_loss": 1.05193245, + "learning_rate": 0.0009976699787917955, + "loss": 1.06288636, + "num_input_tokens_seen": 24876640, + "router_z_loss_mlp": 0.18408203, + "step": 311, + "time_per_iteration": 2.737165689468384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.04018029, + "balance_loss_mlp": 3.94440532, + "epoch": 0.060023085802231625, + "flos": 1569759962112.0, + "grad_norm": 0.34396821433057967, + "language_loss": 0.73442996, + "learning_rate": 0.00099763984076997, + "loss": 0.77461016, + "num_input_tokens_seen": 25110864, + "router_z_loss_mlp": 0.734375, + "step": 312, + "time_per_iteration": 4.990010976791382 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130575, + "balance_loss_mlp": 1.11010623, + "epoch": 0.06021546748749519, + "flos": 482415395328.0, + "grad_norm": 0.18656347991450223, + "language_loss": 0.97164261, + "learning_rate": 0.0009976095095472243, + "loss": 0.98294836, + "num_input_tokens_seen": 25179328, + "router_z_loss_mlp": 0.20458984, + "step": 313, + "time_per_iteration": 2.5596373081207275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143867, + "balance_loss_mlp": 1.12198031, + "epoch": 0.06040784917275875, + "flos": 619889407488.0, + "grad_norm": 0.10017394493353984, + "language_loss": 0.98154747, + "learning_rate": 0.0009975789851353334, + "loss": 0.9929862, + "num_input_tokens_seen": 25254128, + "router_z_loss_mlp": 0.21911621, + "step": 314, + "time_per_iteration": 2.783092498779297 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113993, + "balance_loss_mlp": 1.11832976, + "epoch": 0.06060023085802232, + "flos": 483292721664.0, + "grad_norm": 0.12837029886330253, + "language_loss": 1.00706339, + "learning_rate": 0.0009975482675461487, + "loss": 1.01846266, + "num_input_tokens_seen": 25324624, + "router_z_loss_mlp": 0.21594238, + "step": 315, + "time_per_iteration": 2.6375765800476074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128184, + "balance_loss_mlp": 1.10697675, + "epoch": 0.06079261254328588, + "flos": 581620483584.0, + "grad_norm": 0.07139597701291463, + "language_loss": 0.9800331, + "learning_rate": 0.0009975173567915952, + "loss": 0.99131489, + "num_input_tokens_seen": 25393648, + "router_z_loss_mlp": 0.21228027, + "step": 316, + "time_per_iteration": 2.680223226547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116284, + "balance_loss_mlp": 1.09438515, + "epoch": 0.060984994228549444, + "flos": 687492306432.0, + "grad_norm": 0.12898022133672052, + "language_loss": 0.92624593, + "learning_rate": 0.000997486252883674, + "loss": 0.93740869, + "num_input_tokens_seen": 25469152, + "router_z_loss_mlp": 0.21887207, + "step": 317, + "time_per_iteration": 2.835162878036499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_mlp": 1.08325243, + "epoch": 0.061177375913813004, + "flos": 1314284327424.0, + "grad_norm": 0.06442728945451602, + "language_loss": 0.97186124, + "learning_rate": 0.0009974549558344602, + "loss": 0.98290741, + "num_input_tokens_seen": 25560944, + "router_z_loss_mlp": 0.21350098, + "step": 318, + "time_per_iteration": 3.6293551921844482 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105129, + "balance_loss_mlp": 1.08439815, + "epoch": 0.06136975759907657, + "flos": 574072040448.0, + "grad_norm": 0.08131052095693254, + "language_loss": 1.07145, + "learning_rate": 0.000997423465656105, + "loss": 1.08250129, + "num_input_tokens_seen": 25631424, + "router_z_loss_mlp": 0.20715332, + "step": 319, + "time_per_iteration": 2.7070071697235107 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101786, + "balance_loss_mlp": 1.08168781, + "epoch": 0.06156213928434013, + "flos": 527281242624.0, + "grad_norm": 0.059301156484267634, + "language_loss": 1.04424822, + "learning_rate": 0.0009973917823608335, + "loss": 1.0552659, + "num_input_tokens_seen": 25698176, + "router_z_loss_mlp": 0.20092773, + "step": 320, + "time_per_iteration": 2.6225128173828125 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110531, + "balance_loss_mlp": 1.0897882, + "epoch": 0.061754520969603696, + "flos": 495238123008.0, + "grad_norm": 0.05387649814829365, + "language_loss": 0.98383266, + "learning_rate": 0.0009973599059609462, + "loss": 0.9949379, + "num_input_tokens_seen": 25773472, + "router_z_loss_mlp": 0.20739746, + "step": 321, + "time_per_iteration": 2.692152261734009 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107693, + "balance_loss_mlp": 1.08798778, + "epoch": 0.061946902654867256, + "flos": 439839673344.0, + "grad_norm": 0.06112812680296507, + "language_loss": 0.9711749, + "learning_rate": 0.000997327836468819, + "loss": 0.98225188, + "num_input_tokens_seen": 25841088, + "router_z_loss_mlp": 0.19702148, + "step": 322, + "time_per_iteration": 2.5772383213043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110285, + "balance_loss_mlp": 1.0900557, + "epoch": 0.06213928434013082, + "flos": 598490961408.0, + "grad_norm": 0.0645434874295678, + "language_loss": 0.9942351, + "learning_rate": 0.000997295573896902, + "loss": 1.00533807, + "num_input_tokens_seen": 25919424, + "router_z_loss_mlp": 0.20239258, + "step": 323, + "time_per_iteration": 2.839282274246216 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02259253, + "balance_loss_mlp": 2.20088792, + "epoch": 0.06233166602539438, + "flos": 1449393716736.0, + "grad_norm": 0.19547826226404627, + "language_loss": 0.8119604, + "learning_rate": 0.000997263118257721, + "loss": 0.83455294, + "num_input_tokens_seen": 26135504, + "router_z_loss_mlp": 0.58203125, + "step": 324, + "time_per_iteration": 4.67440938949585 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01888161, + "balance_loss_mlp": 1.83246601, + "epoch": 0.06252404771065795, + "flos": 1462504453632.0, + "grad_norm": 0.11962022052509429, + "language_loss": 0.78571939, + "learning_rate": 0.0009972304695638763, + "loss": 0.80460101, + "num_input_tokens_seen": 26358880, + "router_z_loss_mlp": 0.55859375, + "step": 325, + "time_per_iteration": 4.860283136367798 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177486, + "balance_loss_mlp": 1.15595722, + "epoch": 0.06271642939592151, + "flos": 464059335168.0, + "grad_norm": 0.06272096910143152, + "language_loss": 0.93621421, + "learning_rate": 0.000997197627828043, + "loss": 0.94798911, + "num_input_tokens_seen": 26425888, + "router_z_loss_mlp": 0.2154541, + "step": 326, + "time_per_iteration": 2.5594961643218994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01205877, + "balance_loss_mlp": 1.18165386, + "epoch": 0.06290881108118507, + "flos": 532111776768.0, + "grad_norm": 0.08849931028565244, + "language_loss": 0.89414704, + "learning_rate": 0.0009971645930629716, + "loss": 0.90620589, + "num_input_tokens_seen": 26500656, + "router_z_loss_mlp": 0.2421875, + "step": 327, + "time_per_iteration": 2.7163310050964355 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01223238, + "balance_loss_mlp": 1.19748878, + "epoch": 0.06310119276644863, + "flos": 673262572032.0, + "grad_norm": 0.09892100413683627, + "language_loss": 1.02883804, + "learning_rate": 0.0009971313652814872, + "loss": 1.04107046, + "num_input_tokens_seen": 26577408, + "router_z_loss_mlp": 0.25769043, + "step": 328, + "time_per_iteration": 2.7786266803741455 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228803, + "balance_loss_mlp": 1.20175433, + "epoch": 0.0632935744517122, + "flos": 770404497408.0, + "grad_norm": 0.06852265531332852, + "language_loss": 0.99799907, + "learning_rate": 0.0009970979444964903, + "loss": 1.01028717, + "num_input_tokens_seen": 26652048, + "router_z_loss_mlp": 0.27050781, + "step": 329, + "time_per_iteration": 2.952498197555542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235649, + "balance_loss_mlp": 1.2062993, + "epoch": 0.06348595613697576, + "flos": 561649393152.0, + "grad_norm": 0.09680127661829774, + "language_loss": 1.0121367, + "learning_rate": 0.0009970643307209556, + "loss": 1.02449322, + "num_input_tokens_seen": 26728192, + "router_z_loss_mlp": 0.29296875, + "step": 330, + "time_per_iteration": 2.78190541267395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01240935, + "balance_loss_mlp": 1.20970178, + "epoch": 0.06367833782223932, + "flos": 675891730944.0, + "grad_norm": 0.08786526055569537, + "language_loss": 0.9788332, + "learning_rate": 0.0009970305239679334, + "loss": 0.99124253, + "num_input_tokens_seen": 26798016, + "router_z_loss_mlp": 0.31201172, + "step": 331, + "time_per_iteration": 2.805845022201538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228576, + "balance_loss_mlp": 1.19891691, + "epoch": 0.06387071950750288, + "flos": 495035891712.0, + "grad_norm": 0.10390832636325384, + "language_loss": 1.03124022, + "learning_rate": 0.0009969965242505483, + "loss": 1.04352593, + "num_input_tokens_seen": 26867536, + "router_z_loss_mlp": 0.29614258, + "step": 332, + "time_per_iteration": 2.676711082458496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01207199, + "balance_loss_mlp": 1.1777302, + "epoch": 0.06406310119276645, + "flos": 533170985472.0, + "grad_norm": 0.07105898063788767, + "language_loss": 0.98331362, + "learning_rate": 0.0009969623315820007, + "loss": 0.99538565, + "num_input_tokens_seen": 26941216, + "router_z_loss_mlp": 0.29418945, + "step": 333, + "time_per_iteration": 2.6556739807128906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118815, + "balance_loss_mlp": 1.16106582, + "epoch": 0.06425548287803001, + "flos": 455940513792.0, + "grad_norm": 0.08067516684621483, + "language_loss": 0.99160993, + "learning_rate": 0.000996927945975565, + "loss": 1.0034914, + "num_input_tokens_seen": 27006560, + "router_z_loss_mlp": 0.27124023, + "step": 334, + "time_per_iteration": 2.5398526191711426 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147495, + "balance_loss_mlp": 1.1214596, + "epoch": 0.06444786456329357, + "flos": 559817574912.0, + "grad_norm": 0.08169715789363684, + "language_loss": 0.96174645, + "learning_rate": 0.0009968933674445906, + "loss": 0.97322142, + "num_input_tokens_seen": 27076400, + "router_z_loss_mlp": 0.26062012, + "step": 335, + "time_per_iteration": 2.6592860221862793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112932, + "balance_loss_mlp": 1.0879097, + "epoch": 0.06464024624855713, + "flos": 665769383424.0, + "grad_norm": 0.07104021966044574, + "language_loss": 0.97756392, + "learning_rate": 0.0009968585960025028, + "loss": 0.98869324, + "num_input_tokens_seen": 27158672, + "router_z_loss_mlp": 0.25036621, + "step": 336, + "time_per_iteration": 2.9279658794403076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01860024, + "balance_loss_mlp": 1.84323907, + "epoch": 0.0648326279338207, + "flos": 1520578704384.0, + "grad_norm": 0.14426901756633248, + "language_loss": 0.77653188, + "learning_rate": 0.0009968236316628006, + "loss": 0.7951321, + "num_input_tokens_seen": 27380592, + "router_z_loss_mlp": 0.16796875, + "step": 337, + "time_per_iteration": 4.810914993286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101739, + "balance_loss_mlp": 1.07948256, + "epoch": 0.06502500961908426, + "flos": 1142872768512.0, + "grad_norm": 0.058812216055980165, + "language_loss": 0.95864177, + "learning_rate": 0.0009967884744390583, + "loss": 0.96965921, + "num_input_tokens_seen": 27469984, + "router_z_loss_mlp": 0.22265625, + "step": 338, + "time_per_iteration": 3.512282371520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146504, + "balance_loss_mlp": 1.12267399, + "epoch": 0.06521739130434782, + "flos": 582339248640.0, + "grad_norm": 0.10793578588091769, + "language_loss": 0.97449529, + "learning_rate": 0.0009967531243449256, + "loss": 0.98596036, + "num_input_tokens_seen": 27543904, + "router_z_loss_mlp": 0.23828125, + "step": 339, + "time_per_iteration": 2.712907075881958 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01154087, + "balance_loss_mlp": 1.12950587, + "epoch": 0.06540977298961138, + "flos": 497398800384.0, + "grad_norm": 0.06396927661276222, + "language_loss": 1.04641414, + "learning_rate": 0.000996717581394126, + "loss": 1.05795503, + "num_input_tokens_seen": 27609888, + "router_z_loss_mlp": 0.24584961, + "step": 340, + "time_per_iteration": 2.5783133506774902 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01168509, + "balance_loss_mlp": 1.14584756, + "epoch": 0.06560215467487496, + "flos": 542613855744.0, + "grad_norm": 0.07568553531769329, + "language_loss": 1.05092287, + "learning_rate": 0.000996681845600459, + "loss": 1.062608, + "num_input_tokens_seen": 27683936, + "router_z_loss_mlp": 0.2265625, + "step": 341, + "time_per_iteration": 2.6543757915496826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115357, + "balance_loss_mlp": 1.13118291, + "epoch": 0.06579453636013852, + "flos": 413230961664.0, + "grad_norm": 0.06593832485574395, + "language_loss": 0.97027373, + "learning_rate": 0.0009966459169777982, + "loss": 0.9818095, + "num_input_tokens_seen": 27747840, + "router_z_loss_mlp": 0.22387695, + "step": 342, + "time_per_iteration": 2.5120761394500732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141132, + "balance_loss_mlp": 1.11848283, + "epoch": 0.06598691804540208, + "flos": 560354457600.0, + "grad_norm": 0.055115078659976495, + "language_loss": 1.05281377, + "learning_rate": 0.0009966097955400924, + "loss": 1.0642252, + "num_input_tokens_seen": 27819728, + "router_z_loss_mlp": 0.22644043, + "step": 343, + "time_per_iteration": 2.6954751014709473 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111133, + "balance_loss_mlp": 1.08904982, + "epoch": 0.06617929973066564, + "flos": 571789117440.0, + "grad_norm": 0.06176008438986438, + "language_loss": 0.99064481, + "learning_rate": 0.0009965734813013652, + "loss": 1.00175822, + "num_input_tokens_seen": 27893536, + "router_z_loss_mlp": 0.22277832, + "step": 344, + "time_per_iteration": 2.8235929012298584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090293, + "balance_loss_mlp": 1.06726193, + "epoch": 0.06637168141592921, + "flos": 490234470912.0, + "grad_norm": 0.05365164831273283, + "language_loss": 1.01308548, + "learning_rate": 0.0009965369742757151, + "loss": 1.02398837, + "num_input_tokens_seen": 27960976, + "router_z_loss_mlp": 0.23022461, + "step": 345, + "time_per_iteration": 2.5708556175231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078086, + "balance_loss_mlp": 1.05727243, + "epoch": 0.06656406310119277, + "flos": 1078735656960.0, + "grad_norm": 0.04968829319439664, + "language_loss": 0.97902787, + "learning_rate": 0.0009965002744773152, + "loss": 0.98980874, + "num_input_tokens_seen": 28050864, + "router_z_loss_mlp": 0.20812988, + "step": 346, + "time_per_iteration": 3.4984121322631836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086945, + "balance_loss_mlp": 1.06450987, + "epoch": 0.06675644478645633, + "flos": 513421065216.0, + "grad_norm": 0.06258978415695335, + "language_loss": 0.95138037, + "learning_rate": 0.0009964633819204139, + "loss": 0.96224982, + "num_input_tokens_seen": 28122448, + "router_z_loss_mlp": 0.22436523, + "step": 347, + "time_per_iteration": 2.6866109371185303 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01752926, + "balance_loss_mlp": 1.73108697, + "epoch": 0.06694882647171989, + "flos": 1446359943168.0, + "grad_norm": 0.11694655230354783, + "language_loss": 0.81801116, + "learning_rate": 0.0009964262966193338, + "loss": 0.83554041, + "num_input_tokens_seen": 28350352, + "router_z_loss_mlp": 0.21875, + "step": 348, + "time_per_iteration": 4.935550928115845 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01583198, + "balance_loss_mlp": 1.56116796, + "epoch": 0.06714120815698346, + "flos": 1551230784000.0, + "grad_norm": 0.0989027294649474, + "language_loss": 0.75153887, + "learning_rate": 0.000996389018588473, + "loss": 0.76737082, + "num_input_tokens_seen": 28585584, + "router_z_loss_mlp": 0.22070312, + "step": 349, + "time_per_iteration": 4.891008615493774 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149858, + "balance_loss_mlp": 1.12826955, + "epoch": 0.06733358984224702, + "flos": 879689673216.0, + "grad_norm": 0.07075764146586616, + "language_loss": 0.94920838, + "learning_rate": 0.000996351547842304, + "loss": 0.96070701, + "num_input_tokens_seen": 28672512, + "router_z_loss_mlp": 0.21582031, + "step": 350, + "time_per_iteration": 3.156322717666626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01192552, + "balance_loss_mlp": 1.17055774, + "epoch": 0.06752597152751058, + "flos": 518654651904.0, + "grad_norm": 0.09040238598346795, + "language_loss": 0.93423587, + "learning_rate": 0.0009963138843953744, + "loss": 0.94616139, + "num_input_tokens_seen": 28741520, + "router_z_loss_mlp": 0.2199707, + "step": 351, + "time_per_iteration": 2.610987663269043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01206077, + "balance_loss_mlp": 1.18405879, + "epoch": 0.06771835321277414, + "flos": 539366266368.0, + "grad_norm": 0.08658544591035036, + "language_loss": 0.97852194, + "learning_rate": 0.000996276028262306, + "loss": 0.9905827, + "num_input_tokens_seen": 28814912, + "router_z_loss_mlp": 0.22021484, + "step": 352, + "time_per_iteration": 2.8413686752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01166048, + "balance_loss_mlp": 1.14382768, + "epoch": 0.0679107348980377, + "flos": 460430604288.0, + "grad_norm": 0.09117082479319542, + "language_loss": 1.04269946, + "learning_rate": 0.0009962379794577964, + "loss": 1.05435991, + "num_input_tokens_seen": 28882192, + "router_z_loss_mlp": 0.22216797, + "step": 353, + "time_per_iteration": 2.591372489929199 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114388, + "balance_loss_mlp": 1.12227976, + "epoch": 0.06810311658330127, + "flos": 635601752064.0, + "grad_norm": 0.05781909345233015, + "language_loss": 0.94169199, + "learning_rate": 0.000996199737996617, + "loss": 0.95313084, + "num_input_tokens_seen": 28968576, + "router_z_loss_mlp": 0.21630859, + "step": 354, + "time_per_iteration": 2.9088492393493652 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125411, + "balance_loss_mlp": 1.10420346, + "epoch": 0.06829549826856483, + "flos": 464443448832.0, + "grad_norm": 0.06770201052263504, + "language_loss": 1.03043509, + "learning_rate": 0.0009961613038936149, + "loss": 1.04168916, + "num_input_tokens_seen": 29036160, + "router_z_loss_mlp": 0.2121582, + "step": 355, + "time_per_iteration": 2.571904420852661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110613, + "balance_loss_mlp": 1.08917904, + "epoch": 0.06848787995382839, + "flos": 634335929856.0, + "grad_norm": 0.06097004840688574, + "language_loss": 0.95565176, + "learning_rate": 0.000996122677163711, + "loss": 0.96675789, + "num_input_tokens_seen": 29112048, + "router_z_loss_mlp": 0.21435547, + "step": 356, + "time_per_iteration": 2.794982671737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107296, + "balance_loss_mlp": 1.08667266, + "epoch": 0.06868026163909195, + "flos": 806023913472.0, + "grad_norm": 0.08020973782133771, + "language_loss": 1.01095176, + "learning_rate": 0.000996083857821902, + "loss": 1.02202487, + "num_input_tokens_seen": 29190960, + "router_z_loss_mlp": 0.20629883, + "step": 357, + "time_per_iteration": 3.007086753845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101637, + "balance_loss_mlp": 1.08076346, + "epoch": 0.06887264332435553, + "flos": 438997252608.0, + "grad_norm": 0.08125476198078858, + "language_loss": 0.99797714, + "learning_rate": 0.0009960448458832588, + "loss": 1.00899351, + "num_input_tokens_seen": 29262832, + "router_z_loss_mlp": 0.2088623, + "step": 358, + "time_per_iteration": 2.699530601501465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098146, + "balance_loss_mlp": 1.07872701, + "epoch": 0.06906502500961909, + "flos": 484513463808.0, + "grad_norm": 0.06827746260367892, + "language_loss": 0.99188638, + "learning_rate": 0.000996005641362927, + "loss": 1.00286782, + "num_input_tokens_seen": 29329552, + "router_z_loss_mlp": 0.1940918, + "step": 359, + "time_per_iteration": 2.5541014671325684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103345, + "balance_loss_mlp": 1.0841639, + "epoch": 0.06925740669488265, + "flos": 733293706752.0, + "grad_norm": 0.08731085845928575, + "language_loss": 1.02303529, + "learning_rate": 0.0009959662442761274, + "loss": 1.0340687, + "num_input_tokens_seen": 29410784, + "router_z_loss_mlp": 0.19189453, + "step": 360, + "time_per_iteration": 2.906623363494873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093844, + "balance_loss_mlp": 1.07268476, + "epoch": 0.0694497883801462, + "flos": 552127947264.0, + "grad_norm": 0.06697663210144707, + "language_loss": 0.9595629, + "learning_rate": 0.000995926654638155, + "loss": 0.97050136, + "num_input_tokens_seen": 29486992, + "router_z_loss_mlp": 0.21179199, + "step": 361, + "time_per_iteration": 2.793663501739502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082773, + "balance_loss_mlp": 1.06236482, + "epoch": 0.06964217006540978, + "flos": 677708992512.0, + "grad_norm": 0.06860924301964295, + "language_loss": 0.98198265, + "learning_rate": 0.00099588687246438, + "loss": 0.99281037, + "num_input_tokens_seen": 29557232, + "router_z_loss_mlp": 0.20410156, + "step": 362, + "time_per_iteration": 2.828139305114746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085274, + "balance_loss_mlp": 1.06330371, + "epoch": 0.06983455175067334, + "flos": 523987163136.0, + "grad_norm": 0.08747541291209461, + "language_loss": 1.04803789, + "learning_rate": 0.0009958468977702471, + "loss": 1.0588907, + "num_input_tokens_seen": 29625344, + "router_z_loss_mlp": 0.21972656, + "step": 363, + "time_per_iteration": 2.5759966373443604 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.02224374, + "balance_loss_mlp": 2.20682669, + "epoch": 0.0700269334359369, + "flos": 1575943658496.0, + "grad_norm": 0.2746548069890379, + "language_loss": 0.79734707, + "learning_rate": 0.0009958067305712761, + "loss": 0.81959081, + "num_input_tokens_seen": 29843664, + "router_z_loss_mlp": 0.17578125, + "step": 364, + "time_per_iteration": 4.782835245132446 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134514, + "balance_loss_mlp": 1.11340213, + "epoch": 0.07021931512120046, + "flos": 1012848274944.0, + "grad_norm": 0.08586169827549085, + "language_loss": 0.93286598, + "learning_rate": 0.0009957663708830612, + "loss": 0.94421113, + "num_input_tokens_seen": 29927152, + "router_z_loss_mlp": 0.21105957, + "step": 365, + "time_per_iteration": 3.2484283447265625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116189, + "balance_loss_mlp": 1.13884652, + "epoch": 0.07041169680646403, + "flos": 822622348800.0, + "grad_norm": 0.09941073368395695, + "language_loss": 0.97043455, + "learning_rate": 0.0009957258187212714, + "loss": 0.98205346, + "num_input_tokens_seen": 30004928, + "router_z_loss_mlp": 0.23034668, + "step": 366, + "time_per_iteration": 3.009479522705078 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01756688, + "balance_loss_mlp": 1.7255981, + "epoch": 0.07060407849172759, + "flos": 1413670993920.0, + "grad_norm": 0.12374795181042475, + "language_loss": 0.79194862, + "learning_rate": 0.0009956850741016502, + "loss": 0.80951542, + "num_input_tokens_seen": 30230256, + "router_z_loss_mlp": 0.31054688, + "step": 367, + "time_per_iteration": 4.82874608039856 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152073, + "balance_loss_mlp": 1.13087749, + "epoch": 0.07079646017699115, + "flos": 512652837888.0, + "grad_norm": 0.06786716904588838, + "language_loss": 0.93450886, + "learning_rate": 0.0009956441370400167, + "loss": 0.94602954, + "num_input_tokens_seen": 30301200, + "router_z_loss_mlp": 0.21191406, + "step": 368, + "time_per_iteration": 2.6226603984832764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153965, + "balance_loss_mlp": 1.13158989, + "epoch": 0.07098884186225471, + "flos": 540240772608.0, + "grad_norm": 0.08343626294497461, + "language_loss": 0.99467343, + "learning_rate": 0.0009956030075522636, + "loss": 1.00621307, + "num_input_tokens_seen": 30377024, + "router_z_loss_mlp": 0.22375488, + "step": 369, + "time_per_iteration": 2.7128794193267822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142856, + "balance_loss_mlp": 1.12137485, + "epoch": 0.07118122354751828, + "flos": 548419230720.0, + "grad_norm": 0.07464528715750075, + "language_loss": 0.98955953, + "learning_rate": 0.0009955616856543587, + "loss": 1.00098813, + "num_input_tokens_seen": 30448896, + "router_z_loss_mlp": 0.21472168, + "step": 370, + "time_per_iteration": 2.613138198852539 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118555, + "balance_loss_mlp": 1.0958215, + "epoch": 0.07137360523278184, + "flos": 620612554752.0, + "grad_norm": 0.056434914921328155, + "language_loss": 0.91880834, + "learning_rate": 0.0009955201713623448, + "loss": 0.92999387, + "num_input_tokens_seen": 30523584, + "router_z_loss_mlp": 0.22717285, + "step": 371, + "time_per_iteration": 2.747133255004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01746336, + "balance_loss_mlp": 1.72154021, + "epoch": 0.0715659869180454, + "flos": 1501850115072.0, + "grad_norm": 0.08669176596007007, + "language_loss": 0.76672721, + "learning_rate": 0.000995478464692339, + "loss": 0.78419054, + "num_input_tokens_seen": 30757920, + "router_z_loss_mlp": 0.24707031, + "step": 372, + "time_per_iteration": 4.931428670883179 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102855, + "balance_loss_mlp": 1.08040774, + "epoch": 0.07175836860330896, + "flos": 495246887424.0, + "grad_norm": 0.07044890130803105, + "language_loss": 1.05121827, + "learning_rate": 0.0009954365656605333, + "loss": 1.06224692, + "num_input_tokens_seen": 30824960, + "router_z_loss_mlp": 0.22436523, + "step": 373, + "time_per_iteration": 2.550243616104126 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118244, + "balance_loss_mlp": 1.09438992, + "epoch": 0.07195075028857253, + "flos": 785387902464.0, + "grad_norm": 0.05415547127036835, + "language_loss": 0.98150015, + "learning_rate": 0.0009953944742831947, + "loss": 0.99268264, + "num_input_tokens_seen": 30902224, + "router_z_loss_mlp": 0.23864746, + "step": 374, + "time_per_iteration": 2.9659459590911865 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125507, + "balance_loss_mlp": 1.10202336, + "epoch": 0.0721431319738361, + "flos": 592799067648.0, + "grad_norm": 0.07003669353380264, + "language_loss": 1.01441097, + "learning_rate": 0.0009953521905766642, + "loss": 1.02566612, + "num_input_tokens_seen": 30984784, + "router_z_loss_mlp": 0.23486328, + "step": 375, + "time_per_iteration": 2.942763566970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117119, + "balance_loss_mlp": 1.09393334, + "epoch": 0.07233551365909965, + "flos": 547981272576.0, + "grad_norm": 0.06343477824222313, + "language_loss": 0.99901861, + "learning_rate": 0.0009953097145573577, + "loss": 1.01018989, + "num_input_tokens_seen": 31055376, + "router_z_loss_mlp": 0.23193359, + "step": 376, + "time_per_iteration": 2.6275272369384766 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113711, + "balance_loss_mlp": 1.09023869, + "epoch": 0.07252789534436321, + "flos": 957170428416.0, + "grad_norm": 0.0678891965164594, + "language_loss": 0.97798675, + "learning_rate": 0.000995267046241766, + "loss": 0.98912394, + "num_input_tokens_seen": 31144944, + "router_z_loss_mlp": 0.23474121, + "step": 377, + "time_per_iteration": 3.2014975547790527 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096997, + "balance_loss_mlp": 1.07496762, + "epoch": 0.07272027702962677, + "flos": 507398902272.0, + "grad_norm": 0.0806519998399971, + "language_loss": 0.97275257, + "learning_rate": 0.0009952241856464547, + "loss": 0.98372257, + "num_input_tokens_seen": 31213392, + "router_z_loss_mlp": 0.22045898, + "step": 378, + "time_per_iteration": 2.6189732551574707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109641, + "balance_loss_mlp": 1.0746069, + "epoch": 0.07291265871489035, + "flos": 612128558592.0, + "grad_norm": 0.0691049335661606, + "language_loss": 1.04592681, + "learning_rate": 0.0009951811327880632, + "loss": 1.05689096, + "num_input_tokens_seen": 31289840, + "router_z_loss_mlp": 0.21826172, + "step": 379, + "time_per_iteration": 2.7411558628082275 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092071, + "balance_loss_mlp": 1.07025611, + "epoch": 0.0731050404001539, + "flos": 495502963200.0, + "grad_norm": 0.05765504670581196, + "language_loss": 0.97682816, + "learning_rate": 0.0009951378876833063, + "loss": 0.98774892, + "num_input_tokens_seen": 31357600, + "router_z_loss_mlp": 0.21813965, + "step": 380, + "time_per_iteration": 2.6211278438568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081575, + "balance_loss_mlp": 1.06068945, + "epoch": 0.07329742208541747, + "flos": 639677205504.0, + "grad_norm": 0.06809750593205881, + "language_loss": 1.04190159, + "learning_rate": 0.0009950944503489736, + "loss": 1.05271733, + "num_input_tokens_seen": 31428896, + "router_z_loss_mlp": 0.20898438, + "step": 381, + "time_per_iteration": 2.7533762454986572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081401, + "balance_loss_mlp": 1.0607307, + "epoch": 0.07348980377068103, + "flos": 815999284224.0, + "grad_norm": 0.06607035824886899, + "language_loss": 0.98459697, + "learning_rate": 0.0009950508208019285, + "loss": 0.99541104, + "num_input_tokens_seen": 31507424, + "router_z_loss_mlp": 0.20678711, + "step": 382, + "time_per_iteration": 2.9885637760162354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073667, + "balance_loss_mlp": 1.05369973, + "epoch": 0.0736821854559446, + "flos": 508383917568.0, + "grad_norm": 0.05970909775769663, + "language_loss": 1.02745128, + "learning_rate": 0.0009950069990591096, + "loss": 1.03818798, + "num_input_tokens_seen": 31576768, + "router_z_loss_mlp": 0.19958496, + "step": 383, + "time_per_iteration": 2.6111788749694824 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01835936, + "balance_loss_mlp": 1.8101871, + "epoch": 0.07387456714120816, + "flos": 1553801716224.0, + "grad_norm": 0.167122487372618, + "language_loss": 0.76401371, + "learning_rate": 0.0009949629851375302, + "loss": 0.78237301, + "num_input_tokens_seen": 31797312, + "router_z_loss_mlp": 0.2578125, + "step": 384, + "time_per_iteration": 4.859915494918823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116619, + "balance_loss_mlp": 1.09575748, + "epoch": 0.07406694882647172, + "flos": 525219489792.0, + "grad_norm": 0.0799084124695288, + "language_loss": 0.96017051, + "learning_rate": 0.0009949187790542777, + "loss": 0.97133672, + "num_input_tokens_seen": 31869568, + "router_z_loss_mlp": 0.20861816, + "step": 385, + "time_per_iteration": 2.6976191997528076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124239, + "balance_loss_mlp": 1.10322285, + "epoch": 0.07425933051173528, + "flos": 497468611584.0, + "grad_norm": 0.08753491640442414, + "language_loss": 0.91745877, + "learning_rate": 0.0009948743808265148, + "loss": 0.92870116, + "num_input_tokens_seen": 31941712, + "router_z_loss_mlp": 0.21020508, + "step": 386, + "time_per_iteration": 2.6870572566986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113476, + "balance_loss_mlp": 1.09249496, + "epoch": 0.07445171219699885, + "flos": 504740630016.0, + "grad_norm": 0.05063210924529089, + "language_loss": 1.0156467, + "learning_rate": 0.0009948297904714782, + "loss": 1.02678132, + "num_input_tokens_seen": 32015232, + "router_z_loss_mlp": 0.20996094, + "step": 387, + "time_per_iteration": 2.668027639389038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097529, + "balance_loss_mlp": 1.07642913, + "epoch": 0.07464409388226241, + "flos": 553693515264.0, + "grad_norm": 0.06830922509793466, + "language_loss": 0.93493366, + "learning_rate": 0.0009947850080064796, + "loss": 0.9459089, + "num_input_tokens_seen": 32094640, + "router_z_loss_mlp": 0.21105957, + "step": 388, + "time_per_iteration": 2.79836106300354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098078, + "balance_loss_mlp": 1.07695365, + "epoch": 0.07483647556752597, + "flos": 776511028224.0, + "grad_norm": 0.06471398355705121, + "language_loss": 0.98276728, + "learning_rate": 0.0009947400334489047, + "loss": 0.99374807, + "num_input_tokens_seen": 32176640, + "router_z_loss_mlp": 0.21130371, + "step": 389, + "time_per_iteration": 3.0046355724334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095267, + "balance_loss_mlp": 1.07513261, + "epoch": 0.07502885725278953, + "flos": 612256596480.0, + "grad_norm": 0.0754939105077014, + "language_loss": 0.90272582, + "learning_rate": 0.0009946948668162145, + "loss": 0.91367853, + "num_input_tokens_seen": 32246704, + "router_z_loss_mlp": 0.20141602, + "step": 390, + "time_per_iteration": 2.724792003631592 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091157, + "balance_loss_mlp": 1.06946135, + "epoch": 0.0752212389380531, + "flos": 688324552704.0, + "grad_norm": 0.05626120625508035, + "language_loss": 0.9463594, + "learning_rate": 0.0009946495081259441, + "loss": 0.95727098, + "num_input_tokens_seen": 32320032, + "router_z_loss_mlp": 0.21704102, + "step": 391, + "time_per_iteration": 2.8221397399902344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101684, + "balance_loss_mlp": 1.08008361, + "epoch": 0.07541362062331666, + "flos": 765362967552.0, + "grad_norm": 0.09729902751759628, + "language_loss": 0.97468722, + "learning_rate": 0.0009946039573957035, + "loss": 0.98570406, + "num_input_tokens_seen": 32398144, + "router_z_loss_mlp": 0.21606445, + "step": 392, + "time_per_iteration": 2.958655595779419 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095785, + "balance_loss_mlp": 1.07572174, + "epoch": 0.07560600230858022, + "flos": 588460336128.0, + "grad_norm": 0.06468718689622391, + "language_loss": 0.94257009, + "learning_rate": 0.000994558214643177, + "loss": 0.95352793, + "num_input_tokens_seen": 32471984, + "router_z_loss_mlp": 0.20056152, + "step": 393, + "time_per_iteration": 2.752979040145874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101382, + "balance_loss_mlp": 1.08086586, + "epoch": 0.07579838399384378, + "flos": 749508028416.0, + "grad_norm": 0.06635223139616171, + "language_loss": 0.961483, + "learning_rate": 0.000994512279886123, + "loss": 0.97249681, + "num_input_tokens_seen": 32550176, + "router_z_loss_mlp": 0.20532227, + "step": 394, + "time_per_iteration": 3.055225133895874 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104661, + "balance_loss_mlp": 1.08346581, + "epoch": 0.07599076567910736, + "flos": 523185440256.0, + "grad_norm": 0.06901630142642712, + "language_loss": 0.96749192, + "learning_rate": 0.0009944661531423758, + "loss": 0.97853857, + "num_input_tokens_seen": 32620768, + "router_z_loss_mlp": 0.2121582, + "step": 395, + "time_per_iteration": 2.6922085285186768 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093271, + "balance_loss_mlp": 1.07248056, + "epoch": 0.07618314736437092, + "flos": 550812662784.0, + "grad_norm": 0.07064334209039194, + "language_loss": 0.95375401, + "learning_rate": 0.000994419834429843, + "loss": 0.96468663, + "num_input_tokens_seen": 32693472, + "router_z_loss_mlp": 0.20788574, + "step": 396, + "time_per_iteration": 2.6657333374023438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092352, + "balance_loss_mlp": 1.0716933, + "epoch": 0.07637552904963447, + "flos": 697901253120.0, + "grad_norm": 0.07324881108467876, + "language_loss": 0.99580455, + "learning_rate": 0.0009943733237665069, + "loss": 1.00672793, + "num_input_tokens_seen": 32764976, + "router_z_loss_mlp": 0.20654297, + "step": 397, + "time_per_iteration": 2.8662500381469727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085112, + "balance_loss_mlp": 1.06454849, + "epoch": 0.07656791073489803, + "flos": 579066928128.0, + "grad_norm": 0.04790317238997088, + "language_loss": 0.98118353, + "learning_rate": 0.0009943266211704248, + "loss": 0.99203461, + "num_input_tokens_seen": 32853104, + "router_z_loss_mlp": 0.20568848, + "step": 398, + "time_per_iteration": 2.930741786956787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094784, + "balance_loss_mlp": 1.07348132, + "epoch": 0.0767602924201616, + "flos": 416923711488.0, + "grad_norm": 0.09980331544781734, + "language_loss": 1.00422275, + "learning_rate": 0.000994279726659728, + "loss": 1.01517057, + "num_input_tokens_seen": 32919376, + "router_z_loss_mlp": 0.21325684, + "step": 399, + "time_per_iteration": 2.533738851547241 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109036, + "balance_loss_mlp": 1.06970143, + "epoch": 0.07695267410542517, + "flos": 482671471104.0, + "grad_norm": 0.06967700921129397, + "language_loss": 0.97985041, + "learning_rate": 0.0009942326402526231, + "loss": 0.99075395, + "num_input_tokens_seen": 32988064, + "router_z_loss_mlp": 0.20666504, + "step": 400, + "time_per_iteration": 2.51460337638855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096542, + "balance_loss_mlp": 1.07526302, + "epoch": 0.07714505579068873, + "flos": 530742647808.0, + "grad_norm": 0.052652305799428985, + "language_loss": 0.96639109, + "learning_rate": 0.0009941853619673902, + "loss": 0.97735649, + "num_input_tokens_seen": 33059024, + "router_z_loss_mlp": 0.2130127, + "step": 401, + "time_per_iteration": 2.620939016342163 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101036, + "balance_loss_mlp": 1.08012676, + "epoch": 0.07733743747595229, + "flos": 804635845632.0, + "grad_norm": 0.07273299487754427, + "language_loss": 0.99959278, + "learning_rate": 0.0009941378918223844, + "loss": 1.01060319, + "num_input_tokens_seen": 33137712, + "router_z_loss_mlp": 0.20910645, + "step": 402, + "time_per_iteration": 3.036839008331299 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110477, + "balance_loss_mlp": 1.08423018, + "epoch": 0.07752981916121585, + "flos": 622192679424.0, + "grad_norm": 0.05767312217272775, + "language_loss": 0.93044209, + "learning_rate": 0.0009940902298360354, + "loss": 0.94148982, + "num_input_tokens_seen": 33211296, + "router_z_loss_mlp": 0.20544434, + "step": 403, + "time_per_iteration": 2.7703943252563477 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097477, + "balance_loss_mlp": 1.07694876, + "epoch": 0.07772220084647942, + "flos": 727961195520.0, + "grad_norm": 0.0686344305115436, + "language_loss": 1.02037048, + "learning_rate": 0.0009940423760268473, + "loss": 1.03134525, + "num_input_tokens_seen": 33283632, + "router_z_loss_mlp": 0.2052002, + "step": 404, + "time_per_iteration": 2.8823602199554443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085621, + "balance_loss_mlp": 1.06497431, + "epoch": 0.07791458253174298, + "flos": 555149984256.0, + "grad_norm": 0.10727031409308073, + "language_loss": 0.96142864, + "learning_rate": 0.0009939943304133982, + "loss": 0.97228479, + "num_input_tokens_seen": 33350704, + "router_z_loss_mlp": 0.20654297, + "step": 405, + "time_per_iteration": 2.63908314704895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078944, + "balance_loss_mlp": 1.05944133, + "epoch": 0.07810696421700654, + "flos": 552919495680.0, + "grad_norm": 0.08981509362846728, + "language_loss": 1.0302707, + "learning_rate": 0.0009939460930143416, + "loss": 1.04106021, + "num_input_tokens_seen": 33416272, + "router_z_loss_mlp": 0.19482422, + "step": 406, + "time_per_iteration": 2.63259220123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079269, + "balance_loss_mlp": 1.05927801, + "epoch": 0.0782993459022701, + "flos": 650323289088.0, + "grad_norm": 0.07212254231156982, + "language_loss": 0.96910775, + "learning_rate": 0.0009938976638484043, + "loss": 0.97990054, + "num_input_tokens_seen": 33501824, + "router_z_loss_mlp": 0.1998291, + "step": 407, + "time_per_iteration": 2.9489452838897705 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_mlp": 1.05239439, + "epoch": 0.07849172758753367, + "flos": 495926364672.0, + "grad_norm": 0.07302041560946317, + "language_loss": 0.9619081, + "learning_rate": 0.0009938490429343887, + "loss": 0.97263873, + "num_input_tokens_seen": 33571456, + "router_z_loss_mlp": 0.20666504, + "step": 408, + "time_per_iteration": 2.541293144226074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078297, + "balance_loss_mlp": 1.05823374, + "epoch": 0.07868410927279723, + "flos": 577696389120.0, + "grad_norm": 0.06961121210328268, + "language_loss": 0.96404505, + "learning_rate": 0.0009938002302911709, + "loss": 0.974828, + "num_input_tokens_seen": 33646320, + "router_z_loss_mlp": 0.20056152, + "step": 409, + "time_per_iteration": 2.7890634536743164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078628, + "balance_loss_mlp": 1.05869615, + "epoch": 0.07887649095806079, + "flos": 522698019840.0, + "grad_norm": 0.10283598941623227, + "language_loss": 0.99080813, + "learning_rate": 0.0009937512259377015, + "loss": 1.00159442, + "num_input_tokens_seen": 33717664, + "router_z_loss_mlp": 0.19921875, + "step": 410, + "time_per_iteration": 2.6631360054016113 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076374, + "balance_loss_mlp": 1.05739617, + "epoch": 0.07906887264332435, + "flos": 556958481408.0, + "grad_norm": 0.07518465865945036, + "language_loss": 0.97744381, + "learning_rate": 0.000993702029893006, + "loss": 0.98820746, + "num_input_tokens_seen": 33794720, + "router_z_loss_mlp": 0.18981934, + "step": 411, + "time_per_iteration": 2.762937068939209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070708, + "balance_loss_mlp": 1.0512886, + "epoch": 0.07926125432858792, + "flos": 821641715712.0, + "grad_norm": 0.06547583340109177, + "language_loss": 0.97466588, + "learning_rate": 0.0009936526421761838, + "loss": 0.98537302, + "num_input_tokens_seen": 33868304, + "router_z_loss_mlp": 0.1940918, + "step": 412, + "time_per_iteration": 3.019529342651367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01070741, + "balance_loss_mlp": 1.05210841, + "epoch": 0.07945363601385148, + "flos": 562072794624.0, + "grad_norm": 0.06412617323579047, + "language_loss": 0.9993977, + "learning_rate": 0.000993603062806409, + "loss": 1.01010513, + "num_input_tokens_seen": 33937424, + "router_z_loss_mlp": 0.18615723, + "step": 413, + "time_per_iteration": 2.667893409729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078833, + "balance_loss_mlp": 1.05879402, + "epoch": 0.07964601769911504, + "flos": 517615792128.0, + "grad_norm": 0.0777298152120257, + "language_loss": 1.03187037, + "learning_rate": 0.0009935532918029298, + "loss": 1.04265857, + "num_input_tokens_seen": 34003984, + "router_z_loss_mlp": 0.20031738, + "step": 414, + "time_per_iteration": 2.628847122192383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079604, + "balance_loss_mlp": 1.06020916, + "epoch": 0.0798383993843786, + "flos": 538956011520.0, + "grad_norm": 0.0762846382616791, + "language_loss": 0.96381676, + "learning_rate": 0.0009935033291850694, + "loss": 0.97461283, + "num_input_tokens_seen": 34072400, + "router_z_loss_mlp": 0.19384766, + "step": 415, + "time_per_iteration": 2.6874804496765137 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078311, + "balance_loss_mlp": 1.05915451, + "epoch": 0.08003078106964218, + "flos": 484901959680.0, + "grad_norm": 0.07548152614126195, + "language_loss": 0.9874112, + "learning_rate": 0.0009934531749722247, + "loss": 0.9981944, + "num_input_tokens_seen": 34142448, + "router_z_loss_mlp": 0.19177246, + "step": 416, + "time_per_iteration": 2.5752930641174316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077721, + "balance_loss_mlp": 1.0581702, + "epoch": 0.08022316275490574, + "flos": 517999905792.0, + "grad_norm": 0.07373378819853486, + "language_loss": 0.97326815, + "learning_rate": 0.0009934028291838672, + "loss": 0.98404539, + "num_input_tokens_seen": 34214080, + "router_z_loss_mlp": 0.1953125, + "step": 417, + "time_per_iteration": 2.715142011642456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078885, + "balance_loss_mlp": 1.0593344, + "epoch": 0.0804155444401693, + "flos": 493755512832.0, + "grad_norm": 0.06878732968267398, + "language_loss": 0.9290086, + "learning_rate": 0.0009933522918395433, + "loss": 0.93979746, + "num_input_tokens_seen": 34288448, + "router_z_loss_mlp": 0.19555664, + "step": 418, + "time_per_iteration": 2.7008063793182373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01673141, + "balance_loss_mlp": 1.6505394, + "epoch": 0.08060792612543285, + "flos": 1580567579136.0, + "grad_norm": 0.10865535097535944, + "language_loss": 0.782511, + "learning_rate": 0.0009933015629588731, + "loss": 0.7992425, + "num_input_tokens_seen": 34521632, + "router_z_loss_mlp": 0.22558594, + "step": 419, + "time_per_iteration": 4.854820728302002 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092516, + "balance_loss_mlp": 1.07238102, + "epoch": 0.08080030781069643, + "flos": 525090041856.0, + "grad_norm": 0.07888672823303539, + "language_loss": 1.11010027, + "learning_rate": 0.000993250642561551, + "loss": 1.12102532, + "num_input_tokens_seen": 34590080, + "router_z_loss_mlp": 0.20129395, + "step": 420, + "time_per_iteration": 2.6152822971343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102725, + "balance_loss_mlp": 1.08251905, + "epoch": 0.08099268949595999, + "flos": 546459374592.0, + "grad_norm": 0.06927423279576624, + "language_loss": 0.96781242, + "learning_rate": 0.0009931995306673466, + "loss": 0.97883964, + "num_input_tokens_seen": 34660512, + "router_z_loss_mlp": 0.20202637, + "step": 421, + "time_per_iteration": 2.8378820419311523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107938, + "balance_loss_mlp": 1.08725524, + "epoch": 0.08118507118122355, + "flos": 510116811264.0, + "grad_norm": 0.07245841989657228, + "language_loss": 1.01691484, + "learning_rate": 0.000993148227296103, + "loss": 1.02799416, + "num_input_tokens_seen": 34732016, + "router_z_loss_mlp": 0.20678711, + "step": 422, + "time_per_iteration": 2.6234657764434814 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109153, + "balance_loss_mlp": 1.08827925, + "epoch": 0.08137745286648711, + "flos": 720339969024.0, + "grad_norm": 0.06440268991377437, + "language_loss": 0.90059143, + "learning_rate": 0.000993096732467738, + "loss": 0.91168296, + "num_input_tokens_seen": 34810416, + "router_z_loss_mlp": 0.2088623, + "step": 423, + "time_per_iteration": 2.9789979457855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107042, + "balance_loss_mlp": 1.08620405, + "epoch": 0.08156983455175067, + "flos": 679313848320.0, + "grad_norm": 0.09430690436493987, + "language_loss": 0.97591221, + "learning_rate": 0.0009930450462022435, + "loss": 0.9869827, + "num_input_tokens_seen": 34879504, + "router_z_loss_mlp": 0.20837402, + "step": 424, + "time_per_iteration": 2.7870407104492188 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01731933, + "balance_loss_mlp": 1.70847309, + "epoch": 0.08176221623701424, + "flos": 1452577135104.0, + "grad_norm": 0.13164555017172178, + "language_loss": 0.79189807, + "learning_rate": 0.0009929931685196862, + "loss": 0.80921739, + "num_input_tokens_seen": 35111584, + "router_z_loss_mlp": 0.234375, + "step": 425, + "time_per_iteration": 4.870323181152344 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095108, + "balance_loss_mlp": 1.07456827, + "epoch": 0.0819545979222778, + "flos": 1556034071040.0, + "grad_norm": 0.10298759083167684, + "language_loss": 0.95328236, + "learning_rate": 0.0009929410994402065, + "loss": 0.9642334, + "num_input_tokens_seen": 35205664, + "router_z_loss_mlp": 0.20544434, + "step": 426, + "time_per_iteration": 3.7942585945129395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093366, + "balance_loss_mlp": 1.07214665, + "epoch": 0.08214697960754136, + "flos": 512456398848.0, + "grad_norm": 0.069672302328133, + "language_loss": 0.99507213, + "learning_rate": 0.0009928888389840196, + "loss": 1.00600576, + "num_input_tokens_seen": 35280144, + "router_z_loss_mlp": 0.21240234, + "step": 427, + "time_per_iteration": 2.684760093688965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01073876, + "balance_loss_mlp": 1.05376494, + "epoch": 0.08233936129280492, + "flos": 594850646016.0, + "grad_norm": 0.07796900075206671, + "language_loss": 1.01749206, + "learning_rate": 0.0009928363871714147, + "loss": 1.02823079, + "num_input_tokens_seen": 35344768, + "router_z_loss_mlp": 0.20092773, + "step": 428, + "time_per_iteration": 2.6608195304870605 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01078126, + "balance_loss_mlp": 1.05796742, + "epoch": 0.08253174297806849, + "flos": 571758594048.0, + "grad_norm": 0.07341701057973313, + "language_loss": 0.95524251, + "learning_rate": 0.0009927837440227556, + "loss": 0.96602374, + "num_input_tokens_seen": 35425536, + "router_z_loss_mlp": 0.20153809, + "step": 429, + "time_per_iteration": 2.824958324432373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083273, + "balance_loss_mlp": 1.06413972, + "epoch": 0.08272412466333205, + "flos": 623065623552.0, + "grad_norm": 0.06194570532237157, + "language_loss": 0.90308964, + "learning_rate": 0.0009927309095584798, + "loss": 0.91392243, + "num_input_tokens_seen": 35515440, + "router_z_loss_mlp": 0.19128418, + "step": 430, + "time_per_iteration": 2.9565205574035645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105878, + "balance_loss_mlp": 1.08643484, + "epoch": 0.08291650634859561, + "flos": 513745542144.0, + "grad_norm": 0.09375416706629437, + "language_loss": 1.0225904, + "learning_rate": 0.0009926778837991, + "loss": 1.03364921, + "num_input_tokens_seen": 35580192, + "router_z_loss_mlp": 0.19433594, + "step": 431, + "time_per_iteration": 2.5606777667999268 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104802, + "balance_loss_mlp": 1.08521628, + "epoch": 0.08310888803385917, + "flos": 667073083392.0, + "grad_norm": 0.09022222071598751, + "language_loss": 1.00445497, + "learning_rate": 0.000992624666765202, + "loss": 1.01550293, + "num_input_tokens_seen": 35649472, + "router_z_loss_mlp": 0.19580078, + "step": 432, + "time_per_iteration": 2.763514995574951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112312, + "balance_loss_mlp": 1.09166527, + "epoch": 0.08330126971912274, + "flos": 582995404800.0, + "grad_norm": 0.07142121215748316, + "language_loss": 0.98131895, + "learning_rate": 0.000992571258477447, + "loss": 0.99244213, + "num_input_tokens_seen": 35722848, + "router_z_loss_mlp": 0.20654297, + "step": 433, + "time_per_iteration": 2.7823588848114014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086622, + "balance_loss_mlp": 1.06731021, + "epoch": 0.0834936514043863, + "flos": 561064458240.0, + "grad_norm": 0.06618743000622296, + "language_loss": 0.92206728, + "learning_rate": 0.0009925176589565695, + "loss": 0.93293345, + "num_input_tokens_seen": 35800944, + "router_z_loss_mlp": 0.1932373, + "step": 434, + "time_per_iteration": 2.7774362564086914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109069, + "balance_loss_mlp": 1.07043648, + "epoch": 0.08368603308964986, + "flos": 494272046592.0, + "grad_norm": 0.07800081613857189, + "language_loss": 1.01949787, + "learning_rate": 0.0009924638682233791, + "loss": 1.03040481, + "num_input_tokens_seen": 35866288, + "router_z_loss_mlp": 0.20251465, + "step": 435, + "time_per_iteration": 2.574716091156006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01236801, + "balance_loss_mlp": 1.21505737, + "epoch": 0.08387841477491342, + "flos": 1388322312192.0, + "grad_norm": 0.08820287098199171, + "language_loss": 0.79564589, + "learning_rate": 0.0009924098862987589, + "loss": 0.80801398, + "num_input_tokens_seen": 36083040, + "router_z_loss_mlp": 0.21777344, + "step": 436, + "time_per_iteration": 4.521069049835205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087939, + "balance_loss_mlp": 1.06750691, + "epoch": 0.084070796460177, + "flos": 798642796032.0, + "grad_norm": 0.09737991847895365, + "language_loss": 0.92070073, + "learning_rate": 0.0009923557132036668, + "loss": 0.93158013, + "num_input_tokens_seen": 36158816, + "router_z_loss_mlp": 0.2043457, + "step": 437, + "time_per_iteration": 3.0401971340179443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106896, + "balance_loss_mlp": 1.08635592, + "epoch": 0.08426317814544056, + "flos": 558681200640.0, + "grad_norm": 0.07082709395687636, + "language_loss": 0.96077365, + "learning_rate": 0.0009923013489591345, + "loss": 0.97184265, + "num_input_tokens_seen": 36236432, + "router_z_loss_mlp": 0.20532227, + "step": 438, + "time_per_iteration": 2.7388038635253906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138911, + "balance_loss_mlp": 1.11965871, + "epoch": 0.08445555983070412, + "flos": 810057106944.0, + "grad_norm": 0.09946092642967543, + "language_loss": 0.94659293, + "learning_rate": 0.0009922467935862681, + "loss": 0.95798206, + "num_input_tokens_seen": 36327952, + "router_z_loss_mlp": 0.19250488, + "step": 439, + "time_per_iteration": 3.0827929973602295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153103, + "balance_loss_mlp": 1.13278937, + "epoch": 0.08464794151596768, + "flos": 509939311104.0, + "grad_norm": 0.08658230076015333, + "language_loss": 0.97196984, + "learning_rate": 0.0009921920471062478, + "loss": 0.9835009, + "num_input_tokens_seen": 36394896, + "router_z_loss_mlp": 0.203125, + "step": 440, + "time_per_iteration": 2.5667247772216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109293, + "balance_loss_mlp": 1.08952785, + "epoch": 0.08484032320123125, + "flos": 556149556224.0, + "grad_norm": 0.0779492699350581, + "language_loss": 0.95526892, + "learning_rate": 0.0009921371095403281, + "loss": 0.96636182, + "num_input_tokens_seen": 36464656, + "router_z_loss_mlp": 0.19763184, + "step": 441, + "time_per_iteration": 2.6504476070404053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081558, + "balance_loss_mlp": 1.06137586, + "epoch": 0.08503270488649481, + "flos": 527103742464.0, + "grad_norm": 0.0823758421396894, + "language_loss": 0.98291612, + "learning_rate": 0.0009920819809098379, + "loss": 0.99373174, + "num_input_tokens_seen": 36532208, + "router_z_loss_mlp": 0.20166016, + "step": 442, + "time_per_iteration": 2.5884947776794434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01076633, + "balance_loss_mlp": 1.05612862, + "epoch": 0.08522508657175837, + "flos": 613989490176.0, + "grad_norm": 0.07828377396362728, + "language_loss": 0.94043314, + "learning_rate": 0.0009920266612361798, + "loss": 0.95119947, + "num_input_tokens_seen": 36607360, + "router_z_loss_mlp": 0.20507812, + "step": 443, + "time_per_iteration": 2.7464845180511475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077144, + "balance_loss_mlp": 1.05650926, + "epoch": 0.08541746825702193, + "flos": 619495119360.0, + "grad_norm": 0.07442656272719532, + "language_loss": 0.94335687, + "learning_rate": 0.0009919711505408308, + "loss": 0.95412827, + "num_input_tokens_seen": 36680688, + "router_z_loss_mlp": 0.2064209, + "step": 444, + "time_per_iteration": 2.7615623474121094 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092391, + "balance_loss_mlp": 1.07126665, + "epoch": 0.08560984994228549, + "flos": 482671471104.0, + "grad_norm": 0.08601843511227286, + "language_loss": 0.92049706, + "learning_rate": 0.000991915448845342, + "loss": 0.93142092, + "num_input_tokens_seen": 36746288, + "router_z_loss_mlp": 0.21130371, + "step": 445, + "time_per_iteration": 2.519644260406494 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103035, + "balance_loss_mlp": 1.08145857, + "epoch": 0.08580223162754906, + "flos": 516897027072.0, + "grad_norm": 0.07781715705073443, + "language_loss": 1.01207459, + "learning_rate": 0.000991859556171339, + "loss": 1.02310491, + "num_input_tokens_seen": 36812528, + "router_z_loss_mlp": 0.21569824, + "step": 446, + "time_per_iteration": 2.5678694248199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116922, + "balance_loss_mlp": 1.09462976, + "epoch": 0.08599461331281262, + "flos": 531215511552.0, + "grad_norm": 0.11213971543052093, + "language_loss": 1.02931881, + "learning_rate": 0.000991803472540521, + "loss": 1.040488, + "num_input_tokens_seen": 36879248, + "router_z_loss_mlp": 0.22302246, + "step": 447, + "time_per_iteration": 2.6309196949005127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124555, + "balance_loss_mlp": 1.10302639, + "epoch": 0.08618699499807618, + "flos": 789966743040.0, + "grad_norm": 0.07287006723198586, + "language_loss": 0.97443926, + "learning_rate": 0.0009917471979746615, + "loss": 0.98568487, + "num_input_tokens_seen": 36951376, + "router_z_loss_mlp": 0.21533203, + "step": 448, + "time_per_iteration": 2.9742491245269775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134564, + "balance_loss_mlp": 1.11266506, + "epoch": 0.08637937668333974, + "flos": 565707317760.0, + "grad_norm": 0.08202115093309782, + "language_loss": 0.97199845, + "learning_rate": 0.0009916907324956086, + "loss": 0.98334408, + "num_input_tokens_seen": 37025936, + "router_z_loss_mlp": 0.21923828, + "step": 449, + "time_per_iteration": 2.704089641571045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151497, + "balance_loss_mlp": 1.12693954, + "epoch": 0.08657175836860331, + "flos": 444930665472.0, + "grad_norm": 0.09325215593581063, + "language_loss": 0.93441564, + "learning_rate": 0.0009916340761252837, + "loss": 0.9459306, + "num_input_tokens_seen": 37095872, + "router_z_loss_mlp": 0.24536133, + "step": 450, + "time_per_iteration": 2.5866575241088867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01158359, + "balance_loss_mlp": 1.13567328, + "epoch": 0.08676414005386687, + "flos": 843789450240.0, + "grad_norm": 0.23711660967347972, + "language_loss": 0.90976942, + "learning_rate": 0.0009915772288856832, + "loss": 0.92135304, + "num_input_tokens_seen": 37179072, + "router_z_loss_mlp": 0.22668457, + "step": 451, + "time_per_iteration": 3.109010696411133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0118071, + "balance_loss_mlp": 1.15827537, + "epoch": 0.08695652173913043, + "flos": 602995608576.0, + "grad_norm": 0.08699490701012727, + "language_loss": 0.92036849, + "learning_rate": 0.000991520190798877, + "loss": 0.93217564, + "num_input_tokens_seen": 37260288, + "router_z_loss_mlp": 0.22424316, + "step": 452, + "time_per_iteration": 2.8523812294006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181191, + "balance_loss_mlp": 1.15807629, + "epoch": 0.08714890342439399, + "flos": 730423028736.0, + "grad_norm": 0.09293440668835976, + "language_loss": 1.01637089, + "learning_rate": 0.0009914629618870089, + "loss": 1.02818286, + "num_input_tokens_seen": 37331136, + "router_z_loss_mlp": 0.23095703, + "step": 453, + "time_per_iteration": 2.882887125015259 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142362, + "balance_loss_mlp": 1.12891519, + "epoch": 0.08734128510965757, + "flos": 1481518232064.0, + "grad_norm": 0.0645312523276542, + "language_loss": 0.78675872, + "learning_rate": 0.0009914055421722976, + "loss": 0.79818237, + "num_input_tokens_seen": 37559040, + "router_z_loss_mlp": 0.13476562, + "step": 454, + "time_per_iteration": 4.717878103256226 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103219, + "balance_loss_mlp": 1.09034455, + "epoch": 0.08753366679492113, + "flos": 1522214083584.0, + "grad_norm": 0.04274098512475534, + "language_loss": 0.81427962, + "learning_rate": 0.0009913479316770353, + "loss": 0.82531178, + "num_input_tokens_seen": 37785136, + "router_z_loss_mlp": 0.12890625, + "step": 455, + "time_per_iteration": 4.838243246078491 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171645, + "balance_loss_mlp": 1.14951944, + "epoch": 0.08772604848018468, + "flos": 720935078400.0, + "grad_norm": 0.10543082910841049, + "language_loss": 0.94423014, + "learning_rate": 0.0009912901304235883, + "loss": 0.95594656, + "num_input_tokens_seen": 37858832, + "router_z_loss_mlp": 0.22131348, + "step": 456, + "time_per_iteration": 2.9432015419006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150762, + "balance_loss_mlp": 1.12861252, + "epoch": 0.08791843016544824, + "flos": 707926086144.0, + "grad_norm": 0.10980567381029156, + "language_loss": 0.91300154, + "learning_rate": 0.000991232138434397, + "loss": 0.92450917, + "num_input_tokens_seen": 37931856, + "router_z_loss_mlp": 0.22143555, + "step": 457, + "time_per_iteration": 2.832761526107788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113929, + "balance_loss_mlp": 1.09195828, + "epoch": 0.08811081185071182, + "flos": 472799407104.0, + "grad_norm": 0.1324680836731367, + "language_loss": 0.97845554, + "learning_rate": 0.000991173955731976, + "loss": 0.98959482, + "num_input_tokens_seen": 38002432, + "router_z_loss_mlp": 0.21960449, + "step": 458, + "time_per_iteration": 2.660696506500244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100778, + "balance_loss_mlp": 1.07958269, + "epoch": 0.08830319353597538, + "flos": 684647769600.0, + "grad_norm": 0.07138233575581546, + "language_loss": 1.0178268, + "learning_rate": 0.0009911155823389137, + "loss": 1.02883458, + "num_input_tokens_seen": 38081648, + "router_z_loss_mlp": 0.21203613, + "step": 459, + "time_per_iteration": 2.9878122806549072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105128, + "balance_loss_mlp": 1.08344412, + "epoch": 0.08849557522123894, + "flos": 573235411968.0, + "grad_norm": 0.0735053314112025, + "language_loss": 0.9764787, + "learning_rate": 0.000991057018277873, + "loss": 0.98752999, + "num_input_tokens_seen": 38153424, + "router_z_loss_mlp": 0.21679688, + "step": 460, + "time_per_iteration": 2.707247018814087 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116963, + "balance_loss_mlp": 1.09422946, + "epoch": 0.0886879569065025, + "flos": 564303283200.0, + "grad_norm": 0.10552034142073316, + "language_loss": 0.9759655, + "learning_rate": 0.0009909982635715898, + "loss": 0.98713505, + "num_input_tokens_seen": 38223008, + "router_z_loss_mlp": 0.22729492, + "step": 461, + "time_per_iteration": 2.609016180038452 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120097, + "balance_loss_mlp": 1.09760189, + "epoch": 0.08888033859176607, + "flos": 563609249280.0, + "grad_norm": 0.09185893532484944, + "language_loss": 0.96625364, + "learning_rate": 0.0009909393182428751, + "loss": 0.97745454, + "num_input_tokens_seen": 38294592, + "router_z_loss_mlp": 0.22497559, + "step": 462, + "time_per_iteration": 2.682616949081421 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116703, + "balance_loss_mlp": 1.09437466, + "epoch": 0.08907272027702963, + "flos": 465517214208.0, + "grad_norm": 0.08888403374641002, + "language_loss": 0.91300213, + "learning_rate": 0.000990880182314614, + "loss": 0.92416912, + "num_input_tokens_seen": 38365792, + "router_z_loss_mlp": 0.22314453, + "step": 463, + "time_per_iteration": 2.732579469680786 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122985, + "balance_loss_mlp": 1.10014486, + "epoch": 0.08926510196229319, + "flos": 681200921088.0, + "grad_norm": 0.07408309604525525, + "language_loss": 0.92294347, + "learning_rate": 0.0009908208558097643, + "loss": 0.93417335, + "num_input_tokens_seen": 38447776, + "router_z_loss_mlp": 0.22839355, + "step": 464, + "time_per_iteration": 2.910313606262207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137827, + "balance_loss_mlp": 1.115273, + "epoch": 0.08945748364755675, + "flos": 596411831808.0, + "grad_norm": 0.08673846989427919, + "language_loss": 0.93827909, + "learning_rate": 0.000990761338751359, + "loss": 0.94965738, + "num_input_tokens_seen": 38521632, + "router_z_loss_mlp": 0.22546387, + "step": 465, + "time_per_iteration": 2.827570676803589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133815, + "balance_loss_mlp": 1.12222791, + "epoch": 0.08964986533282032, + "flos": 1585082400768.0, + "grad_norm": 0.06082202694548154, + "language_loss": 0.73659623, + "learning_rate": 0.0009907016311625045, + "loss": 0.74793446, + "num_input_tokens_seen": 38760528, + "router_z_loss_mlp": 0.11572266, + "step": 466, + "time_per_iteration": 4.960917234420776 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01177765, + "balance_loss_mlp": 1.15419745, + "epoch": 0.08984224701808388, + "flos": 533268499968.0, + "grad_norm": 0.4900596090566038, + "language_loss": 0.96587038, + "learning_rate": 0.0009906417330663815, + "loss": 0.97764802, + "num_input_tokens_seen": 38827200, + "router_z_loss_mlp": 0.23571777, + "step": 467, + "time_per_iteration": 2.5937299728393555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01157084, + "balance_loss_mlp": 1.13383865, + "epoch": 0.09003462870334744, + "flos": 478702296576.0, + "grad_norm": 0.08613132202477504, + "language_loss": 0.92798859, + "learning_rate": 0.0009905816444862442, + "loss": 0.93955946, + "num_input_tokens_seen": 38891984, + "router_z_loss_mlp": 0.23217773, + "step": 468, + "time_per_iteration": 2.6012237071990967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164868, + "balance_loss_mlp": 1.14150274, + "epoch": 0.090227010388611, + "flos": 653307448320.0, + "grad_norm": 0.08218040805372613, + "language_loss": 0.90769458, + "learning_rate": 0.0009905213654454216, + "loss": 0.91934329, + "num_input_tokens_seen": 38977136, + "router_z_loss_mlp": 0.23364258, + "step": 469, + "time_per_iteration": 2.8727760314941406 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01176439, + "balance_loss_mlp": 1.15152478, + "epoch": 0.09041939207387456, + "flos": 617894645760.0, + "grad_norm": 0.09256259391525869, + "language_loss": 0.97864139, + "learning_rate": 0.0009904608959673158, + "loss": 0.9904058, + "num_input_tokens_seen": 39052224, + "router_z_loss_mlp": 0.24938965, + "step": 470, + "time_per_iteration": 2.7991952896118164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01151805, + "balance_loss_mlp": 1.12671185, + "epoch": 0.09061177375913813, + "flos": 454137808896.0, + "grad_norm": 0.09693984756275055, + "language_loss": 0.97988749, + "learning_rate": 0.000990400236075403, + "loss": 0.99140555, + "num_input_tokens_seen": 39116832, + "router_z_loss_mlp": 0.25109863, + "step": 471, + "time_per_iteration": 2.523508310317993 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125394, + "balance_loss_mlp": 1.10119498, + "epoch": 0.0908041554444017, + "flos": 543982984704.0, + "grad_norm": 0.09250187628709369, + "language_loss": 0.9490509, + "learning_rate": 0.0009903393857932338, + "loss": 0.96030486, + "num_input_tokens_seen": 39190528, + "router_z_loss_mlp": 0.24194336, + "step": 472, + "time_per_iteration": 2.7065584659576416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124084, + "balance_loss_mlp": 1.09912193, + "epoch": 0.09099653712966525, + "flos": 564052999680.0, + "grad_norm": 0.10897832311722938, + "language_loss": 0.93660218, + "learning_rate": 0.0009902783451444317, + "loss": 0.94784307, + "num_input_tokens_seen": 39263168, + "router_z_loss_mlp": 0.24963379, + "step": 473, + "time_per_iteration": 2.7067277431488037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108649, + "balance_loss_mlp": 1.08496177, + "epoch": 0.09118891881492881, + "flos": 474300956160.0, + "grad_norm": 0.09402902414949979, + "language_loss": 0.97273493, + "learning_rate": 0.0009902171141526956, + "loss": 0.98382139, + "num_input_tokens_seen": 39330784, + "router_z_loss_mlp": 0.23693848, + "step": 474, + "time_per_iteration": 2.5281569957733154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087186, + "balance_loss_mlp": 1.06240201, + "epoch": 0.09138130050019239, + "flos": 545579076096.0, + "grad_norm": 0.06728788346792411, + "language_loss": 0.85273343, + "learning_rate": 0.000990155692841797, + "loss": 0.86360526, + "num_input_tokens_seen": 39417472, + "router_z_loss_mlp": 0.2479248, + "step": 475, + "time_per_iteration": 2.970107316970825 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010835, + "balance_loss_mlp": 1.0587163, + "epoch": 0.09157368218545595, + "flos": 732397441536.0, + "grad_norm": 0.07226189405033341, + "language_loss": 0.97062063, + "learning_rate": 0.0009900940812355818, + "loss": 0.98145562, + "num_input_tokens_seen": 39488656, + "router_z_loss_mlp": 0.24768066, + "step": 476, + "time_per_iteration": 2.959184169769287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096233, + "balance_loss_mlp": 1.07208097, + "epoch": 0.0917660638707195, + "flos": 610709967360.0, + "grad_norm": 0.09034653129128065, + "language_loss": 0.92824447, + "learning_rate": 0.00099003227935797, + "loss": 0.93920678, + "num_input_tokens_seen": 39558224, + "router_z_loss_mlp": 0.24157715, + "step": 477, + "time_per_iteration": 2.7553765773773193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113263, + "balance_loss_mlp": 1.08839583, + "epoch": 0.09195844555598306, + "flos": 655561257984.0, + "grad_norm": 0.09830094540804109, + "language_loss": 0.95358098, + "learning_rate": 0.000989970287232955, + "loss": 0.96471357, + "num_input_tokens_seen": 39629856, + "router_z_loss_mlp": 0.2487793, + "step": 478, + "time_per_iteration": 2.7916457653045654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112064, + "balance_loss_mlp": 1.09633327, + "epoch": 0.09215082724124664, + "flos": 476339387904.0, + "grad_norm": 0.08054303064285366, + "language_loss": 0.93560576, + "learning_rate": 0.0009899081048846043, + "loss": 0.94681215, + "num_input_tokens_seen": 39695984, + "router_z_loss_mlp": 0.24267578, + "step": 479, + "time_per_iteration": 2.554161787033081 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114732, + "balance_loss_mlp": 1.12177348, + "epoch": 0.0923432089265102, + "flos": 524051182080.0, + "grad_norm": 0.1186512856896222, + "language_loss": 0.97593725, + "learning_rate": 0.0009898457323370593, + "loss": 0.98741049, + "num_input_tokens_seen": 39760256, + "router_z_loss_mlp": 0.25549316, + "step": 480, + "time_per_iteration": 2.5794191360473633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131558, + "balance_loss_mlp": 1.10608315, + "epoch": 0.09253559061177376, + "flos": 545302651392.0, + "grad_norm": 0.10688941209840569, + "language_loss": 0.96892118, + "learning_rate": 0.000989783169614535, + "loss": 0.98023689, + "num_input_tokens_seen": 39827984, + "router_z_loss_mlp": 0.25512695, + "step": 481, + "time_per_iteration": 2.6676101684570312 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01336494, + "balance_loss_mlp": 1.32304764, + "epoch": 0.09272797229703732, + "flos": 1537222219776.0, + "grad_norm": 0.112558059824644, + "language_loss": 0.78752756, + "learning_rate": 0.0009897204167413206, + "loss": 0.80089253, + "num_input_tokens_seen": 40056688, + "router_z_loss_mlp": 0.13476562, + "step": 482, + "time_per_iteration": 4.910710096359253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121205, + "balance_loss_mlp": 1.09537172, + "epoch": 0.09292035398230089, + "flos": 689501624832.0, + "grad_norm": 0.08905484371867754, + "language_loss": 0.93989253, + "learning_rate": 0.000989657473741779, + "loss": 0.95110452, + "num_input_tokens_seen": 40133120, + "router_z_loss_mlp": 0.25866699, + "step": 483, + "time_per_iteration": 2.8736467361450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120092, + "balance_loss_mlp": 1.09219658, + "epoch": 0.09311273566756445, + "flos": 509482414080.0, + "grad_norm": 0.10011855628381364, + "language_loss": 0.94861096, + "learning_rate": 0.0009895943406403465, + "loss": 0.95981193, + "num_input_tokens_seen": 40206464, + "router_z_loss_mlp": 0.27905273, + "step": 484, + "time_per_iteration": 2.7233312129974365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114409, + "balance_loss_mlp": 1.08641887, + "epoch": 0.09330511735282801, + "flos": 659111413248.0, + "grad_norm": 0.10884122740481975, + "language_loss": 0.87602448, + "learning_rate": 0.0009895310174615338, + "loss": 0.88716859, + "num_input_tokens_seen": 40277744, + "router_z_loss_mlp": 0.2800293, + "step": 485, + "time_per_iteration": 2.7538061141967773 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098211, + "balance_loss_mlp": 1.08533621, + "epoch": 0.09349749903809157, + "flos": 1452054809088.0, + "grad_norm": 0.04867374252302138, + "language_loss": 0.75718516, + "learning_rate": 0.0009894675042299251, + "loss": 0.76816726, + "num_input_tokens_seen": 40503664, + "router_z_loss_mlp": 0.12890625, + "step": 486, + "time_per_iteration": 4.681119441986084 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121456, + "balance_loss_mlp": 1.09291732, + "epoch": 0.09368988072335514, + "flos": 520614508032.0, + "grad_norm": 0.07858969791005947, + "language_loss": 0.92458618, + "learning_rate": 0.0009894038009701782, + "loss": 0.93580067, + "num_input_tokens_seen": 40571376, + "router_z_loss_mlp": 0.28515625, + "step": 487, + "time_per_iteration": 2.6114649772644043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153128, + "balance_loss_mlp": 1.12148952, + "epoch": 0.0938822624086187, + "flos": 497502107136.0, + "grad_norm": 0.11959755259003642, + "language_loss": 0.91595036, + "learning_rate": 0.0009893399077070253, + "loss": 0.92748165, + "num_input_tokens_seen": 40638096, + "router_z_loss_mlp": 0.31616211, + "step": 488, + "time_per_iteration": 2.5603692531585693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127952, + "balance_loss_mlp": 1.09845996, + "epoch": 0.09407464409388226, + "flos": 532948405248.0, + "grad_norm": 0.09098963794592498, + "language_loss": 0.89760649, + "learning_rate": 0.0009892758244652718, + "loss": 0.90888608, + "num_input_tokens_seen": 40710992, + "router_z_loss_mlp": 0.29516602, + "step": 489, + "time_per_iteration": 2.65938401222229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01127724, + "balance_loss_mlp": 1.09568012, + "epoch": 0.09426702577914582, + "flos": 585736634880.0, + "grad_norm": 0.09102778373185845, + "language_loss": 0.94519842, + "learning_rate": 0.0009892115512697968, + "loss": 0.95647562, + "num_input_tokens_seen": 40778896, + "router_z_loss_mlp": 0.3203125, + "step": 490, + "time_per_iteration": 2.6538186073303223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120065, + "balance_loss_mlp": 1.08926105, + "epoch": 0.0944594074644094, + "flos": 503081929728.0, + "grad_norm": 0.07724049493821064, + "language_loss": 0.96624851, + "learning_rate": 0.0009891470881455537, + "loss": 0.97744912, + "num_input_tokens_seen": 40853376, + "router_z_loss_mlp": 0.30810547, + "step": 491, + "time_per_iteration": 2.699535608291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122711, + "balance_loss_mlp": 1.09145451, + "epoch": 0.09465178914967295, + "flos": 570748847616.0, + "grad_norm": 0.0816499633869022, + "language_loss": 0.94510269, + "learning_rate": 0.0009890824351175692, + "loss": 0.95632982, + "num_input_tokens_seen": 40923776, + "router_z_loss_mlp": 0.31225586, + "step": 492, + "time_per_iteration": 2.678191661834717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125893, + "balance_loss_mlp": 1.09418344, + "epoch": 0.09484417083493651, + "flos": 549098707968.0, + "grad_norm": 0.07977284094064935, + "language_loss": 0.98609412, + "learning_rate": 0.0009890175922109435, + "loss": 0.99735302, + "num_input_tokens_seen": 40996848, + "router_z_loss_mlp": 0.31689453, + "step": 493, + "time_per_iteration": 2.6466987133026123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138627, + "balance_loss_mlp": 1.10534418, + "epoch": 0.09503655252020007, + "flos": 823552109568.0, + "grad_norm": 0.09331424233507904, + "language_loss": 0.96939492, + "learning_rate": 0.0009889525594508513, + "loss": 0.9807812, + "num_input_tokens_seen": 41071280, + "router_z_loss_mlp": 0.33300781, + "step": 494, + "time_per_iteration": 3.009894371032715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01153225, + "balance_loss_mlp": 1.12218332, + "epoch": 0.09522893420546363, + "flos": 404397757440.0, + "grad_norm": 0.08141129996203125, + "language_loss": 0.91043431, + "learning_rate": 0.0009888873368625404, + "loss": 0.92196655, + "num_input_tokens_seen": 41136304, + "router_z_loss_mlp": 0.31030273, + "step": 495, + "time_per_iteration": 2.4904890060424805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01171726, + "balance_loss_mlp": 1.14025438, + "epoch": 0.0954213158907272, + "flos": 690707810304.0, + "grad_norm": 0.08256479818708104, + "language_loss": 0.94339681, + "learning_rate": 0.0009888219244713326, + "loss": 0.95511413, + "num_input_tokens_seen": 41212384, + "router_z_loss_mlp": 0.31445312, + "step": 496, + "time_per_iteration": 2.8060483932495117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01181664, + "balance_loss_mlp": 1.15033531, + "epoch": 0.09561369757599077, + "flos": 518739019776.0, + "grad_norm": 0.10472312979641793, + "language_loss": 0.94370055, + "learning_rate": 0.0009887563223026229, + "loss": 0.95551717, + "num_input_tokens_seen": 41282528, + "router_z_loss_mlp": 0.31323242, + "step": 497, + "time_per_iteration": 2.6536803245544434 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228939, + "balance_loss_mlp": 1.21549225, + "epoch": 0.09580607926125433, + "flos": 1384825849344.0, + "grad_norm": 0.04877985805939708, + "language_loss": 0.7906816, + "learning_rate": 0.0009886905303818805, + "loss": 0.80297101, + "num_input_tokens_seen": 41512256, + "router_z_loss_mlp": 0.13476562, + "step": 498, + "time_per_iteration": 4.874605178833008 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01197245, + "balance_loss_mlp": 1.16455829, + "epoch": 0.09599846094651789, + "flos": 717090969600.0, + "grad_norm": 0.08863465655244346, + "language_loss": 0.93284124, + "learning_rate": 0.0009886245487346482, + "loss": 0.94481373, + "num_input_tokens_seen": 41596816, + "router_z_loss_mlp": 0.3269043, + "step": 499, + "time_per_iteration": 3.047938108444214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011865, + "balance_loss_mlp": 1.15474319, + "epoch": 0.09619084263178146, + "flos": 385824909312.0, + "grad_norm": 0.09673466805801513, + "language_loss": 0.96238041, + "learning_rate": 0.0009885583773865422, + "loss": 0.97424543, + "num_input_tokens_seen": 41658544, + "router_z_loss_mlp": 0.31762695, + "step": 500, + "time_per_iteration": 2.402763843536377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140705, + "balance_loss_mlp": 1.1099968, + "epoch": 0.09638322431704502, + "flos": 533869401600.0, + "grad_norm": 0.08556524095898377, + "language_loss": 0.93457472, + "learning_rate": 0.0009884920163632524, + "loss": 0.94598186, + "num_input_tokens_seen": 41730736, + "router_z_loss_mlp": 0.30688477, + "step": 501, + "time_per_iteration": 2.7420296669006348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155853, + "balance_loss_mlp": 1.12373805, + "epoch": 0.09657560600230858, + "flos": 500426629632.0, + "grad_norm": 0.08462195742795481, + "language_loss": 0.95688182, + "learning_rate": 0.000988425465690543, + "loss": 0.96844035, + "num_input_tokens_seen": 41797824, + "router_z_loss_mlp": 0.32104492, + "step": 502, + "time_per_iteration": 2.5425736904144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01163304, + "balance_loss_mlp": 1.13099861, + "epoch": 0.09676798768757214, + "flos": 528995197440.0, + "grad_norm": 0.07192036847451248, + "language_loss": 0.92721838, + "learning_rate": 0.0009883587253942505, + "loss": 0.93885148, + "num_input_tokens_seen": 41875520, + "router_z_loss_mlp": 0.32324219, + "step": 503, + "time_per_iteration": 2.8340742588043213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01188959, + "balance_loss_mlp": 1.15598607, + "epoch": 0.09696036937283571, + "flos": 463379857920.0, + "grad_norm": 0.0888689340699796, + "language_loss": 0.99166393, + "learning_rate": 0.0009882917955002862, + "loss": 1.00355351, + "num_input_tokens_seen": 41942224, + "router_z_loss_mlp": 0.32983398, + "step": 504, + "time_per_iteration": 2.560448169708252 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147535, + "balance_loss_mlp": 1.11606395, + "epoch": 0.09715275105809927, + "flos": 534716204544.0, + "grad_norm": 0.07251663236407552, + "language_loss": 0.9150176, + "learning_rate": 0.0009882246760346343, + "loss": 0.92649293, + "num_input_tokens_seen": 42007552, + "router_z_loss_mlp": 0.31420898, + "step": 505, + "time_per_iteration": 2.6460299491882324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0114081, + "balance_loss_mlp": 1.10714495, + "epoch": 0.09734513274336283, + "flos": 454713979392.0, + "grad_norm": 0.10061537251918176, + "language_loss": 0.96100289, + "learning_rate": 0.0009881573670233533, + "loss": 0.97241098, + "num_input_tokens_seen": 42071760, + "router_z_loss_mlp": 0.33666992, + "step": 506, + "time_per_iteration": 2.5137040615081787 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109977, + "balance_loss_mlp": 1.08029366, + "epoch": 0.09753751442862639, + "flos": 508551243264.0, + "grad_norm": 0.0762964042901656, + "language_loss": 0.91185808, + "learning_rate": 0.0009880898684925747, + "loss": 0.92295784, + "num_input_tokens_seen": 42140688, + "router_z_loss_mlp": 0.29663086, + "step": 507, + "time_per_iteration": 2.6571738719940186 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110119, + "balance_loss_mlp": 1.07133985, + "epoch": 0.09772989611388996, + "flos": 484030425600.0, + "grad_norm": 0.07531505250568626, + "language_loss": 0.89554358, + "learning_rate": 0.0009880221804685037, + "loss": 0.90655547, + "num_input_tokens_seen": 42208544, + "router_z_loss_mlp": 0.29882812, + "step": 508, + "time_per_iteration": 2.596289873123169 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01404721, + "balance_loss_mlp": 1.39136958, + "epoch": 0.09792227779915352, + "flos": 1565306339328.0, + "grad_norm": 0.10151454340945995, + "language_loss": 0.79344422, + "learning_rate": 0.000987954302977419, + "loss": 0.80749142, + "num_input_tokens_seen": 42426624, + "router_z_loss_mlp": 0.13378906, + "step": 509, + "time_per_iteration": 4.724441051483154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116621, + "balance_loss_mlp": 1.08655643, + "epoch": 0.09811465948441708, + "flos": 587529165312.0, + "grad_norm": 0.08257009801201759, + "language_loss": 0.94708043, + "learning_rate": 0.0009878862360456733, + "loss": 0.95824659, + "num_input_tokens_seen": 42494592, + "router_z_loss_mlp": 0.30029297, + "step": 510, + "time_per_iteration": 2.703011989593506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122701, + "balance_loss_mlp": 1.09406662, + "epoch": 0.09830704116968064, + "flos": 612719285760.0, + "grad_norm": 0.06191460590209878, + "language_loss": 0.88457662, + "learning_rate": 0.0009878179796996922, + "loss": 0.89580369, + "num_input_tokens_seen": 42564944, + "router_z_loss_mlp": 0.28637695, + "step": 511, + "time_per_iteration": 2.7212226390838623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128587, + "balance_loss_mlp": 1.09885597, + "epoch": 0.09849942285494422, + "flos": 538528227840.0, + "grad_norm": 0.06874751685339883, + "language_loss": 0.9199326, + "learning_rate": 0.0009877495339659754, + "loss": 0.9312185, + "num_input_tokens_seen": 42645616, + "router_z_loss_mlp": 0.29724121, + "step": 512, + "time_per_iteration": 2.7520575523376465 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111609, + "balance_loss_mlp": 1.08826661, + "epoch": 0.09869180454020778, + "flos": 620193535488.0, + "grad_norm": 0.06953003964378547, + "language_loss": 0.87301105, + "learning_rate": 0.000987680898871096, + "loss": 0.88417196, + "num_input_tokens_seen": 42713632, + "router_z_loss_mlp": 0.27832031, + "step": 513, + "time_per_iteration": 2.7121992111206055 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134292, + "balance_loss_mlp": 1.10401261, + "epoch": 0.09888418622547133, + "flos": 811375363584.0, + "grad_norm": 0.1024184057853134, + "language_loss": 0.87763435, + "learning_rate": 0.0009876120744417, + "loss": 0.88897729, + "num_input_tokens_seen": 42789088, + "router_z_loss_mlp": 0.30273438, + "step": 514, + "time_per_iteration": 2.971573829650879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123143, + "balance_loss_mlp": 1.09267306, + "epoch": 0.0990765679107349, + "flos": 535548450816.0, + "grad_norm": 0.06764912074049458, + "language_loss": 0.95588082, + "learning_rate": 0.0009875430607045078, + "loss": 0.9671123, + "num_input_tokens_seen": 42861168, + "router_z_loss_mlp": 0.3046875, + "step": 515, + "time_per_iteration": 2.6630361080169678 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108813, + "balance_loss_mlp": 1.08072746, + "epoch": 0.09926894959599845, + "flos": 587607740928.0, + "grad_norm": 0.06593749006245919, + "language_loss": 0.92788792, + "learning_rate": 0.000987473857686313, + "loss": 0.93897605, + "num_input_tokens_seen": 42934112, + "router_z_loss_mlp": 0.28076172, + "step": 516, + "time_per_iteration": 2.710068702697754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121556, + "balance_loss_mlp": 1.09039485, + "epoch": 0.09946133128126203, + "flos": 640947409920.0, + "grad_norm": 0.08862761474564218, + "language_loss": 0.9451825, + "learning_rate": 0.0009874044654139824, + "loss": 0.95639801, + "num_input_tokens_seen": 43005248, + "router_z_loss_mlp": 0.3112793, + "step": 517, + "time_per_iteration": 2.729975461959839 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117034, + "balance_loss_mlp": 1.08520555, + "epoch": 0.09965371296652559, + "flos": 465546327552.0, + "grad_norm": 0.09157938746936445, + "language_loss": 0.9250825, + "learning_rate": 0.0009873348839144563, + "loss": 0.93625283, + "num_input_tokens_seen": 43070576, + "router_z_loss_mlp": 0.31811523, + "step": 518, + "time_per_iteration": 2.5117127895355225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112516, + "balance_loss_mlp": 1.09540534, + "epoch": 0.09984609465178915, + "flos": 483365505024.0, + "grad_norm": 0.07736257304557469, + "language_loss": 0.9674046, + "learning_rate": 0.000987265113214749, + "loss": 0.97865617, + "num_input_tokens_seen": 43138048, + "router_z_loss_mlp": 0.29711914, + "step": 519, + "time_per_iteration": 2.5816774368286133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147544, + "balance_loss_mlp": 1.11421299, + "epoch": 0.1000384763370527, + "flos": 568764260352.0, + "grad_norm": 0.08763817133734854, + "language_loss": 0.96583092, + "learning_rate": 0.0009871951533419476, + "loss": 0.97730637, + "num_input_tokens_seen": 43207600, + "router_z_loss_mlp": 0.33325195, + "step": 520, + "time_per_iteration": 2.638664484024048 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140597, + "balance_loss_mlp": 1.108482, + "epoch": 0.10023085802231628, + "flos": 545515057152.0, + "grad_norm": 0.10925869968591369, + "language_loss": 0.88377398, + "learning_rate": 0.0009871250043232132, + "loss": 0.89517999, + "num_input_tokens_seen": 43285104, + "router_z_loss_mlp": 0.32104492, + "step": 521, + "time_per_iteration": 2.70491886138916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01136934, + "balance_loss_mlp": 1.10555792, + "epoch": 0.10042323970757984, + "flos": 503208557568.0, + "grad_norm": 0.07694864026409119, + "language_loss": 0.87725985, + "learning_rate": 0.0009870546661857797, + "loss": 0.8886292, + "num_input_tokens_seen": 43353312, + "router_z_loss_mlp": 0.31347656, + "step": 522, + "time_per_iteration": 2.653456211090088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126678, + "balance_loss_mlp": 1.09380031, + "epoch": 0.1006156213928434, + "flos": 770084402688.0, + "grad_norm": 0.08414569380370593, + "language_loss": 0.95787346, + "learning_rate": 0.0009869841389569553, + "loss": 0.96914017, + "num_input_tokens_seen": 43427680, + "router_z_loss_mlp": 0.32885742, + "step": 523, + "time_per_iteration": 2.9442663192749023 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116557, + "balance_loss_mlp": 1.08625388, + "epoch": 0.10080800307810696, + "flos": 489786338304.0, + "grad_norm": 0.06587351152736676, + "language_loss": 0.88897854, + "learning_rate": 0.0009869134226641206, + "loss": 0.90014416, + "num_input_tokens_seen": 43495200, + "router_z_loss_mlp": 0.30297852, + "step": 524, + "time_per_iteration": 2.5559868812561035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110225, + "balance_loss_mlp": 1.07746601, + "epoch": 0.10100038476337053, + "flos": 454478252544.0, + "grad_norm": 0.09167866019985617, + "language_loss": 0.88383424, + "learning_rate": 0.0009868425173347303, + "loss": 0.89493656, + "num_input_tokens_seen": 43566256, + "router_z_loss_mlp": 0.32788086, + "step": 525, + "time_per_iteration": 2.645116090774536 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111349, + "balance_loss_mlp": 1.08216143, + "epoch": 0.10119276644863409, + "flos": 556155348480.0, + "grad_norm": 0.07288604326691553, + "language_loss": 0.96749896, + "learning_rate": 0.0009867714229963125, + "loss": 0.97863394, + "num_input_tokens_seen": 43639696, + "router_z_loss_mlp": 0.31323242, + "step": 526, + "time_per_iteration": 2.730703592300415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106354, + "balance_loss_mlp": 1.07540703, + "epoch": 0.10138514813389765, + "flos": 515990587392.0, + "grad_norm": 0.07095113284061857, + "language_loss": 0.93916923, + "learning_rate": 0.000986700139676468, + "loss": 0.95023274, + "num_input_tokens_seen": 43703872, + "router_z_loss_mlp": 0.30932617, + "step": 527, + "time_per_iteration": 2.5836338996887207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110446, + "balance_loss_mlp": 1.07833052, + "epoch": 0.10157752981916121, + "flos": 500323322880.0, + "grad_norm": 0.06933811905919615, + "language_loss": 0.91673893, + "learning_rate": 0.0009866286674028717, + "loss": 0.92784333, + "num_input_tokens_seen": 43774416, + "router_z_loss_mlp": 0.32104492, + "step": 528, + "time_per_iteration": 2.7084739208221436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101831, + "balance_loss_mlp": 1.07100391, + "epoch": 0.10176991150442478, + "flos": 656444376576.0, + "grad_norm": 0.07189407365130172, + "language_loss": 0.88586026, + "learning_rate": 0.0009865570062032717, + "loss": 0.8968786, + "num_input_tokens_seen": 43853376, + "router_z_loss_mlp": 0.30810547, + "step": 529, + "time_per_iteration": 2.9141628742218018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103952, + "balance_loss_mlp": 1.07443571, + "epoch": 0.10196229318968834, + "flos": 572974953984.0, + "grad_norm": 0.06841647032337263, + "language_loss": 0.93659967, + "learning_rate": 0.0009864851561054893, + "loss": 0.94763923, + "num_input_tokens_seen": 43929632, + "router_z_loss_mlp": 0.29516602, + "step": 530, + "time_per_iteration": 2.7539894580841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090977, + "balance_loss_mlp": 1.06110358, + "epoch": 0.1021546748749519, + "flos": 517946061312.0, + "grad_norm": 0.07340246055426732, + "language_loss": 0.91722125, + "learning_rate": 0.0009864131171374191, + "loss": 0.92813098, + "num_input_tokens_seen": 44002144, + "router_z_loss_mlp": 0.29882812, + "step": 531, + "time_per_iteration": 2.6921956539154053 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109944, + "balance_loss_mlp": 1.06749225, + "epoch": 0.10234705656021546, + "flos": 609470286336.0, + "grad_norm": 0.07867637119915549, + "language_loss": 0.91107762, + "learning_rate": 0.0009863408893270292, + "loss": 0.92207205, + "num_input_tokens_seen": 44078272, + "router_z_loss_mlp": 0.31933594, + "step": 532, + "time_per_iteration": 2.7911570072174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106913, + "balance_loss_mlp": 1.07396317, + "epoch": 0.10253943824547904, + "flos": 601473710592.0, + "grad_norm": 0.08191923529880715, + "language_loss": 0.86522454, + "learning_rate": 0.0009862684727023605, + "loss": 0.87629366, + "num_input_tokens_seen": 44152304, + "router_z_loss_mlp": 0.3293457, + "step": 533, + "time_per_iteration": 2.7452800273895264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105875, + "balance_loss_mlp": 1.07466602, + "epoch": 0.1027318199307426, + "flos": 662647011840.0, + "grad_norm": 0.07282647554851075, + "language_loss": 0.90315968, + "learning_rate": 0.0009861958672915283, + "loss": 0.91421843, + "num_input_tokens_seen": 44226720, + "router_z_loss_mlp": 0.31201172, + "step": 534, + "time_per_iteration": 2.8041269779205322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096602, + "balance_loss_mlp": 1.0673244, + "epoch": 0.10292420161600616, + "flos": 682962928128.0, + "grad_norm": 0.058349855756870184, + "language_loss": 0.90126884, + "learning_rate": 0.0009861230731227201, + "loss": 0.9122349, + "num_input_tokens_seen": 44303600, + "router_z_loss_mlp": 0.29248047, + "step": 535, + "time_per_iteration": 2.8627805709838867 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108033, + "balance_loss_mlp": 1.07615674, + "epoch": 0.10311658330126972, + "flos": 490042414080.0, + "grad_norm": 0.091555564896082, + "language_loss": 0.91954774, + "learning_rate": 0.0009860500902241973, + "loss": 0.93062806, + "num_input_tokens_seen": 44370960, + "router_z_loss_mlp": 0.31884766, + "step": 536, + "time_per_iteration": 2.6052157878875732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120335, + "balance_loss_mlp": 1.08800602, + "epoch": 0.10330896498653329, + "flos": 431508446208.0, + "grad_norm": 0.0585767653270487, + "language_loss": 0.96574026, + "learning_rate": 0.0009859769186242942, + "loss": 0.97694361, + "num_input_tokens_seen": 44435584, + "router_z_loss_mlp": 0.32324219, + "step": 537, + "time_per_iteration": 2.51180362701416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116517, + "balance_loss_mlp": 1.08571362, + "epoch": 0.10350134667179685, + "flos": 549330052608.0, + "grad_norm": 0.0744119924563098, + "language_loss": 0.8926785, + "learning_rate": 0.0009859035583514187, + "loss": 0.90384364, + "num_input_tokens_seen": 44505456, + "router_z_loss_mlp": 0.30834961, + "step": 538, + "time_per_iteration": 2.6369993686676025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01146613, + "balance_loss_mlp": 1.11380613, + "epoch": 0.10369372835706041, + "flos": 640327569408.0, + "grad_norm": 0.09976070350989504, + "language_loss": 0.90389431, + "learning_rate": 0.0009858300094340517, + "loss": 0.91536051, + "num_input_tokens_seen": 44580208, + "router_z_loss_mlp": 0.328125, + "step": 539, + "time_per_iteration": 2.7695086002349854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01150737, + "balance_loss_mlp": 1.11838388, + "epoch": 0.10388611004232397, + "flos": 521500598784.0, + "grad_norm": 0.08771902350159133, + "language_loss": 0.85304511, + "learning_rate": 0.0009857562719007473, + "loss": 0.8645525, + "num_input_tokens_seen": 44646576, + "router_z_loss_mlp": 0.32324219, + "step": 540, + "time_per_iteration": 2.59881329536438 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144681, + "balance_loss_mlp": 1.11320961, + "epoch": 0.10407849172758753, + "flos": 702111946752.0, + "grad_norm": 0.07496368213999542, + "language_loss": 0.88249481, + "learning_rate": 0.0009856823457801331, + "loss": 0.89394164, + "num_input_tokens_seen": 44726752, + "router_z_loss_mlp": 0.31494141, + "step": 541, + "time_per_iteration": 2.873481035232544 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119735, + "balance_loss_mlp": 1.08738184, + "epoch": 0.1042708734128511, + "flos": 502652736000.0, + "grad_norm": 0.06973546911765124, + "language_loss": 0.94998306, + "learning_rate": 0.00098560823110091, + "loss": 0.96118045, + "num_input_tokens_seen": 44795824, + "router_z_loss_mlp": 0.32373047, + "step": 542, + "time_per_iteration": 2.661374807357788 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_mlp": 1.08757377, + "epoch": 0.10446325509811466, + "flos": 485331153408.0, + "grad_norm": 0.0792045331206184, + "language_loss": 0.95517921, + "learning_rate": 0.000985533927891851, + "loss": 0.96635967, + "num_input_tokens_seen": 44868496, + "router_z_loss_mlp": 0.30419922, + "step": 543, + "time_per_iteration": 2.7264697551727295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096256, + "balance_loss_mlp": 1.06502366, + "epoch": 0.10465563678337822, + "flos": 568365590016.0, + "grad_norm": 0.0919664039836503, + "language_loss": 0.93718112, + "learning_rate": 0.0009854594361818044, + "loss": 0.94814372, + "num_input_tokens_seen": 44939888, + "router_z_loss_mlp": 0.31201172, + "step": 544, + "time_per_iteration": 2.6869821548461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099422, + "balance_loss_mlp": 1.0683322, + "epoch": 0.10484801846864178, + "flos": 625806853632.0, + "grad_norm": 0.1054615502202609, + "language_loss": 0.927598, + "learning_rate": 0.0009853847559996897, + "loss": 0.9385922, + "num_input_tokens_seen": 45012720, + "router_z_loss_mlp": 0.31103516, + "step": 545, + "time_per_iteration": 2.7953526973724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100313, + "balance_loss_mlp": 1.06772113, + "epoch": 0.10504040015390535, + "flos": 743063874048.0, + "grad_norm": 0.0768702593450629, + "language_loss": 0.92008656, + "learning_rate": 0.0009853098873745, + "loss": 0.93108964, + "num_input_tokens_seen": 45093744, + "router_z_loss_mlp": 0.32592773, + "step": 546, + "time_per_iteration": 3.0344293117523193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106321, + "balance_loss_mlp": 1.07430172, + "epoch": 0.10523278183916891, + "flos": 586382616576.0, + "grad_norm": 0.072035501246702, + "language_loss": 0.90983582, + "learning_rate": 0.0009852348303353027, + "loss": 0.92089903, + "num_input_tokens_seen": 45172784, + "router_z_loss_mlp": 0.32006836, + "step": 547, + "time_per_iteration": 2.7647972106933594 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110403, + "balance_loss_mlp": 1.07100892, + "epoch": 0.10542516352443247, + "flos": 869270552064.0, + "grad_norm": 0.07817580313906373, + "language_loss": 0.84611928, + "learning_rate": 0.000985159584911237, + "loss": 0.85715961, + "num_input_tokens_seen": 45255600, + "router_z_loss_mlp": 0.33007812, + "step": 548, + "time_per_iteration": 3.143122434616089 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104478, + "balance_loss_mlp": 1.07212472, + "epoch": 0.10561754520969603, + "flos": 505182970368.0, + "grad_norm": 0.08898596974063745, + "language_loss": 0.91126573, + "learning_rate": 0.0009850841511315162, + "loss": 0.92231047, + "num_input_tokens_seen": 45325072, + "router_z_loss_mlp": 0.32348633, + "step": 549, + "time_per_iteration": 2.6164846420288086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112982, + "balance_loss_mlp": 1.07946038, + "epoch": 0.1058099268949596, + "flos": 559690947072.0, + "grad_norm": 0.06224197989448247, + "language_loss": 0.92054999, + "learning_rate": 0.0009850085290254256, + "loss": 0.93167984, + "num_input_tokens_seen": 45401440, + "router_z_loss_mlp": 0.33520508, + "step": 550, + "time_per_iteration": 2.7473480701446533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110676, + "balance_loss_mlp": 1.07431078, + "epoch": 0.10600230858022316, + "flos": 561773048832.0, + "grad_norm": 0.05678957528127819, + "language_loss": 0.88957977, + "learning_rate": 0.0009849327186223246, + "loss": 0.90064728, + "num_input_tokens_seen": 45479264, + "router_z_loss_mlp": 0.32446289, + "step": 551, + "time_per_iteration": 2.805126905441284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094878, + "balance_loss_mlp": 1.06297779, + "epoch": 0.10619469026548672, + "flos": 494079989760.0, + "grad_norm": 0.07906939671673464, + "language_loss": 0.95596325, + "learning_rate": 0.000984856719951646, + "loss": 0.96691203, + "num_input_tokens_seen": 45547328, + "router_z_loss_mlp": 0.31860352, + "step": 552, + "time_per_iteration": 2.5688273906707764 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105536, + "balance_loss_mlp": 1.07370734, + "epoch": 0.10638707195075028, + "flos": 675843678720.0, + "grad_norm": 0.06469368191660979, + "language_loss": 0.93170857, + "learning_rate": 0.0009847805330428943, + "loss": 0.94276392, + "num_input_tokens_seen": 45631152, + "router_z_loss_mlp": 0.31811523, + "step": 553, + "time_per_iteration": 2.8858227729797363 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105116, + "balance_loss_mlp": 1.07080746, + "epoch": 0.10657945363601386, + "flos": 487811925504.0, + "grad_norm": 0.07365688544553677, + "language_loss": 0.94454086, + "learning_rate": 0.0009847041579256481, + "loss": 0.95559192, + "num_input_tokens_seen": 45698208, + "router_z_loss_mlp": 0.34326172, + "step": 554, + "time_per_iteration": 2.5912039279937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114154, + "balance_loss_mlp": 1.08158636, + "epoch": 0.10677183532127742, + "flos": 482706376704.0, + "grad_norm": 0.06731486395760358, + "language_loss": 0.95310724, + "learning_rate": 0.0009846275946295592, + "loss": 0.96424878, + "num_input_tokens_seen": 45766640, + "router_z_loss_mlp": 0.32568359, + "step": 555, + "time_per_iteration": 2.6071619987487793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120557, + "balance_loss_mlp": 1.08755958, + "epoch": 0.10696421700654098, + "flos": 655917668352.0, + "grad_norm": 0.06239681935918944, + "language_loss": 0.88169777, + "learning_rate": 0.0009845508431843518, + "loss": 0.89290333, + "num_input_tokens_seen": 45851408, + "router_z_loss_mlp": 0.32983398, + "step": 556, + "time_per_iteration": 2.9906973838806152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122479, + "balance_loss_mlp": 1.08986306, + "epoch": 0.10715659869180454, + "flos": 567483881472.0, + "grad_norm": 0.06803394611182671, + "language_loss": 0.89010829, + "learning_rate": 0.0009844739036198233, + "loss": 0.90133309, + "num_input_tokens_seen": 45919824, + "router_z_loss_mlp": 0.32592773, + "step": 557, + "time_per_iteration": 2.6462793350219727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113246, + "balance_loss_mlp": 1.09927225, + "epoch": 0.10734898037706811, + "flos": 540432829440.0, + "grad_norm": 0.0683091886411484, + "language_loss": 0.96000761, + "learning_rate": 0.0009843967759658448, + "loss": 0.97133219, + "num_input_tokens_seen": 45991024, + "router_z_loss_mlp": 0.33203125, + "step": 558, + "time_per_iteration": 2.664320707321167 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01369087, + "balance_loss_mlp": 1.3546865, + "epoch": 0.10754136206233167, + "flos": 1475870008320.0, + "grad_norm": 0.12144998025248735, + "language_loss": 0.72767758, + "learning_rate": 0.0009843194602523592, + "loss": 0.74136841, + "num_input_tokens_seen": 46212736, + "router_z_loss_mlp": 0.14355469, + "step": 559, + "time_per_iteration": 4.836310148239136 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124853, + "balance_loss_mlp": 1.0925231, + "epoch": 0.10773374374759523, + "flos": 512155243008.0, + "grad_norm": 0.06725764235558847, + "language_loss": 0.96045369, + "learning_rate": 0.000984241956509384, + "loss": 0.97170222, + "num_input_tokens_seen": 46283920, + "router_z_loss_mlp": 0.32324219, + "step": 560, + "time_per_iteration": 2.7409372329711914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134795, + "balance_loss_mlp": 1.10005689, + "epoch": 0.10792612543285879, + "flos": 496261016064.0, + "grad_norm": 0.08502468521942065, + "language_loss": 0.91520619, + "learning_rate": 0.0009841642647670078, + "loss": 0.92655414, + "num_input_tokens_seen": 46349664, + "router_z_loss_mlp": 0.34741211, + "step": 561, + "time_per_iteration": 2.5360167026519775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134435, + "balance_loss_mlp": 1.10050821, + "epoch": 0.10811850711812235, + "flos": 735131317248.0, + "grad_norm": 0.08550854990342285, + "language_loss": 0.86122006, + "learning_rate": 0.0009840863850553944, + "loss": 0.87256444, + "num_input_tokens_seen": 46432688, + "router_z_loss_mlp": 0.33911133, + "step": 562, + "time_per_iteration": 3.0013930797576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118751, + "balance_loss_mlp": 1.08604038, + "epoch": 0.10831088880338592, + "flos": 611257024512.0, + "grad_norm": 0.07414056330929218, + "language_loss": 0.92513216, + "learning_rate": 0.0009840083174047782, + "loss": 0.93631971, + "num_input_tokens_seen": 46507216, + "router_z_loss_mlp": 0.3269043, + "step": 563, + "time_per_iteration": 2.761746883392334 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125103, + "balance_loss_mlp": 1.09353685, + "epoch": 0.10850327048864948, + "flos": 556022928384.0, + "grad_norm": 0.06849160846851732, + "language_loss": 0.86520386, + "learning_rate": 0.0009839300618454685, + "loss": 0.87645483, + "num_input_tokens_seen": 46590464, + "router_z_loss_mlp": 0.31518555, + "step": 564, + "time_per_iteration": 2.833545684814453 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124691, + "balance_loss_mlp": 1.09291005, + "epoch": 0.10869565217391304, + "flos": 602902476288.0, + "grad_norm": 0.06688991061359367, + "language_loss": 0.92471159, + "learning_rate": 0.0009838516184078466, + "loss": 0.9359585, + "num_input_tokens_seen": 46666240, + "router_z_loss_mlp": 0.31762695, + "step": 565, + "time_per_iteration": 2.838482618331909 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112559, + "balance_loss_mlp": 1.09345102, + "epoch": 0.1088880338591766, + "flos": 525922288128.0, + "grad_norm": 0.08266802783800845, + "language_loss": 0.89073956, + "learning_rate": 0.0009837729871223669, + "loss": 0.90199542, + "num_input_tokens_seen": 46734288, + "router_z_loss_mlp": 0.3215332, + "step": 566, + "time_per_iteration": 2.6670589447021484 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134729, + "balance_loss_mlp": 1.10073042, + "epoch": 0.10908041554444017, + "flos": 619986921984.0, + "grad_norm": 0.06816497946354988, + "language_loss": 0.89503658, + "learning_rate": 0.0009836941680195568, + "loss": 0.90638387, + "num_input_tokens_seen": 46809920, + "router_z_loss_mlp": 0.34033203, + "step": 567, + "time_per_iteration": 2.7894582748413086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131677, + "balance_loss_mlp": 1.09691525, + "epoch": 0.10927279722970373, + "flos": 897740195328.0, + "grad_norm": 0.07371226629870802, + "language_loss": 0.8534497, + "learning_rate": 0.0009836151611300166, + "loss": 0.86476642, + "num_input_tokens_seen": 46889984, + "router_z_loss_mlp": 0.34765625, + "step": 568, + "time_per_iteration": 3.204500913619995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116051, + "balance_loss_mlp": 1.08467555, + "epoch": 0.10946517891496729, + "flos": 528408852480.0, + "grad_norm": 0.061952855977424344, + "language_loss": 0.96103537, + "learning_rate": 0.0009835359664844194, + "loss": 0.97219586, + "num_input_tokens_seen": 46959536, + "router_z_loss_mlp": 0.3137207, + "step": 569, + "time_per_iteration": 2.6154720783233643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124163, + "balance_loss_mlp": 1.11014414, + "epoch": 0.10965756060023085, + "flos": 1559944714752.0, + "grad_norm": 0.03358522647050957, + "language_loss": 0.81036806, + "learning_rate": 0.0009834565841135114, + "loss": 0.82160974, + "num_input_tokens_seen": 47196960, + "router_z_loss_mlp": 0.140625, + "step": 570, + "time_per_iteration": 4.907090187072754 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112487, + "balance_loss_mlp": 1.09406638, + "epoch": 0.10984994228549443, + "flos": 512820163584.0, + "grad_norm": 0.08674533322611513, + "language_loss": 0.9339065, + "learning_rate": 0.0009833770140481118, + "loss": 0.9451552, + "num_input_tokens_seen": 47266560, + "router_z_loss_mlp": 0.30786133, + "step": 571, + "time_per_iteration": 2.694821357727051 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121358, + "balance_loss_mlp": 1.09072113, + "epoch": 0.11004232397075799, + "flos": 954314307072.0, + "grad_norm": 0.07582699316256973, + "language_loss": 0.84126109, + "learning_rate": 0.000983297256319112, + "loss": 0.85247469, + "num_input_tokens_seen": 47348512, + "router_z_loss_mlp": 0.30664062, + "step": 572, + "time_per_iteration": 3.208728313446045 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01144326, + "balance_loss_mlp": 1.11097169, + "epoch": 0.11023470565602154, + "flos": 487921024512.0, + "grad_norm": 0.07530566153242002, + "language_loss": 0.8789041, + "learning_rate": 0.000983217310957477, + "loss": 0.89034736, + "num_input_tokens_seen": 47425392, + "router_z_loss_mlp": 0.33349609, + "step": 573, + "time_per_iteration": 2.7521331310272217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0113474, + "balance_loss_mlp": 1.1014812, + "epoch": 0.1104270873412851, + "flos": 655521970176.0, + "grad_norm": 0.08427122985019045, + "language_loss": 0.91161472, + "learning_rate": 0.000983137177994244, + "loss": 0.92296207, + "num_input_tokens_seen": 47502336, + "router_z_loss_mlp": 0.33300781, + "step": 574, + "time_per_iteration": 2.869795083999634 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105984, + "balance_loss_mlp": 1.0752039, + "epoch": 0.11061946902654868, + "flos": 723097165824.0, + "grad_norm": 0.0803000190442887, + "language_loss": 0.87202144, + "learning_rate": 0.0009830568574605235, + "loss": 0.88308132, + "num_input_tokens_seen": 47583552, + "router_z_loss_mlp": 0.30737305, + "step": 575, + "time_per_iteration": 2.952505111694336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111674, + "balance_loss_mlp": 1.07963109, + "epoch": 0.11081185071181224, + "flos": 835113397248.0, + "grad_norm": 0.07764025760375837, + "language_loss": 0.89234924, + "learning_rate": 0.0009829763493874992, + "loss": 0.90346599, + "num_input_tokens_seen": 47663440, + "router_z_loss_mlp": 0.3203125, + "step": 576, + "time_per_iteration": 3.0367727279663086 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110641, + "balance_loss_mlp": 1.07508206, + "epoch": 0.1110042323970758, + "flos": 608776252416.0, + "grad_norm": 0.06795308301133055, + "language_loss": 0.94366598, + "learning_rate": 0.0009828956538064264, + "loss": 0.95473009, + "num_input_tokens_seen": 47741920, + "router_z_loss_mlp": 0.31347656, + "step": 577, + "time_per_iteration": 2.783268928527832 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091394, + "balance_loss_mlp": 1.0610671, + "epoch": 0.11119661408233936, + "flos": 595643604480.0, + "grad_norm": 0.0662915232098912, + "language_loss": 0.9183138, + "learning_rate": 0.0009828147707486344, + "loss": 0.92922771, + "num_input_tokens_seen": 47815136, + "router_z_loss_mlp": 0.30297852, + "step": 578, + "time_per_iteration": 2.6628670692443848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092993, + "balance_loss_mlp": 1.06109214, + "epoch": 0.11138899576760293, + "flos": 555573385728.0, + "grad_norm": 0.07355059798421615, + "language_loss": 0.87444091, + "learning_rate": 0.0009827337002455245, + "loss": 0.88537085, + "num_input_tokens_seen": 47881360, + "router_z_loss_mlp": 0.31884766, + "step": 579, + "time_per_iteration": 2.616842031478882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087398, + "balance_loss_mlp": 1.05857313, + "epoch": 0.11158137745286649, + "flos": 689418667008.0, + "grad_norm": 0.05531737995895799, + "language_loss": 0.89474124, + "learning_rate": 0.0009826524423285712, + "loss": 0.90561521, + "num_input_tokens_seen": 47962720, + "router_z_loss_mlp": 0.28808594, + "step": 580, + "time_per_iteration": 2.896409749984741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093471, + "balance_loss_mlp": 1.06393051, + "epoch": 0.11177375913813005, + "flos": 762688728576.0, + "grad_norm": 0.06807232662928764, + "language_loss": 0.9046967, + "learning_rate": 0.0009825709970293218, + "loss": 0.91563141, + "num_input_tokens_seen": 48035472, + "router_z_loss_mlp": 0.2956543, + "step": 581, + "time_per_iteration": 2.8843319416046143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096208, + "balance_loss_mlp": 1.0669775, + "epoch": 0.11196614082339361, + "flos": 806211588096.0, + "grad_norm": 0.07053725402235117, + "language_loss": 0.96166003, + "learning_rate": 0.0009824893643793956, + "loss": 0.9726221, + "num_input_tokens_seen": 48116944, + "router_z_loss_mlp": 0.29248047, + "step": 582, + "time_per_iteration": 3.04577898979187 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104715, + "balance_loss_mlp": 1.07305288, + "epoch": 0.11215852250865718, + "flos": 558350931456.0, + "grad_norm": 0.10752491555358674, + "language_loss": 0.89033759, + "learning_rate": 0.0009824075444104857, + "loss": 0.90138471, + "num_input_tokens_seen": 48187808, + "router_z_loss_mlp": 0.31689453, + "step": 583, + "time_per_iteration": 2.682020902633667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125233, + "balance_loss_mlp": 1.09497714, + "epoch": 0.11235090419392074, + "flos": 513322140672.0, + "grad_norm": 0.06606619546840543, + "language_loss": 0.94941097, + "learning_rate": 0.000982325537154357, + "loss": 0.9606632, + "num_input_tokens_seen": 48254464, + "router_z_loss_mlp": 0.30224609, + "step": 584, + "time_per_iteration": 2.577632427215576 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122311, + "balance_loss_mlp": 1.09045827, + "epoch": 0.1125432858791843, + "flos": 491209311744.0, + "grad_norm": 0.07452844115700766, + "language_loss": 0.95190644, + "learning_rate": 0.0009822433426428484, + "loss": 0.96312958, + "num_input_tokens_seen": 48318784, + "router_z_loss_mlp": 0.31860352, + "step": 585, + "time_per_iteration": 2.560591220855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126565, + "balance_loss_mlp": 1.09280539, + "epoch": 0.11273566756444786, + "flos": 510476193792.0, + "grad_norm": 0.11434848401200806, + "language_loss": 0.87964213, + "learning_rate": 0.0009821609609078697, + "loss": 0.89090776, + "num_input_tokens_seen": 48389248, + "router_z_loss_mlp": 0.3371582, + "step": 586, + "time_per_iteration": 2.633925437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118464, + "balance_loss_mlp": 1.08785152, + "epoch": 0.11292804924971142, + "flos": 622149009408.0, + "grad_norm": 0.08000190427267627, + "language_loss": 0.905334, + "learning_rate": 0.0009820783919814045, + "loss": 0.91651857, + "num_input_tokens_seen": 48463312, + "router_z_loss_mlp": 0.3059082, + "step": 587, + "time_per_iteration": 2.806704044342041 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111289, + "balance_loss_mlp": 1.07857847, + "epoch": 0.113120430934975, + "flos": 477811823616.0, + "grad_norm": 0.09357252991594707, + "language_loss": 0.83955467, + "learning_rate": 0.0009819956358955095, + "loss": 0.8506676, + "num_input_tokens_seen": 48531856, + "router_z_loss_mlp": 0.32714844, + "step": 588, + "time_per_iteration": 2.5903711318969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109097, + "balance_loss_mlp": 1.07455039, + "epoch": 0.11331281262023855, + "flos": 466801975296.0, + "grad_norm": 0.06610764616840299, + "language_loss": 0.85348701, + "learning_rate": 0.0009819126926823127, + "loss": 0.86457801, + "num_input_tokens_seen": 48596640, + "router_z_loss_mlp": 0.34570312, + "step": 589, + "time_per_iteration": 2.5726494789123535 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108183, + "balance_loss_mlp": 1.07535291, + "epoch": 0.11350519430550211, + "flos": 650164727808.0, + "grad_norm": 0.06035980490561805, + "language_loss": 0.87806922, + "learning_rate": 0.000981829562374016, + "loss": 0.8891511, + "num_input_tokens_seen": 48669648, + "router_z_loss_mlp": 0.328125, + "step": 590, + "time_per_iteration": 2.7960643768310547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112987, + "balance_loss_mlp": 1.08041859, + "epoch": 0.11369757599076567, + "flos": 557547798528.0, + "grad_norm": 0.08830164474684658, + "language_loss": 0.98550045, + "learning_rate": 0.0009817462450028933, + "loss": 0.99663031, + "num_input_tokens_seen": 48737392, + "router_z_loss_mlp": 0.32568359, + "step": 591, + "time_per_iteration": 2.654860734939575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107596, + "balance_loss_mlp": 1.07526684, + "epoch": 0.11388995767602925, + "flos": 570774988800.0, + "grad_norm": 0.06245390963608315, + "language_loss": 0.86587834, + "learning_rate": 0.0009816627406012916, + "loss": 0.87695432, + "num_input_tokens_seen": 48817136, + "router_z_loss_mlp": 0.32348633, + "step": 592, + "time_per_iteration": 2.8017733097076416 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101808, + "balance_loss_mlp": 1.07074225, + "epoch": 0.1140823393612928, + "flos": 740069540352.0, + "grad_norm": 0.06581053360364857, + "language_loss": 0.8595314, + "learning_rate": 0.0009815790492016295, + "loss": 0.87054944, + "num_input_tokens_seen": 48895808, + "router_z_loss_mlp": 0.31030273, + "step": 593, + "time_per_iteration": 2.9602174758911133 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097875, + "balance_loss_mlp": 1.06666636, + "epoch": 0.11427472104655637, + "flos": 698694211584.0, + "grad_norm": 0.07124053574400792, + "language_loss": 0.87982339, + "learning_rate": 0.0009814951708363993, + "loss": 0.89080215, + "num_input_tokens_seen": 48967456, + "router_z_loss_mlp": 0.31201172, + "step": 594, + "time_per_iteration": 2.818460702896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01167391, + "balance_loss_mlp": 1.15413451, + "epoch": 0.11446710273181993, + "flos": 1476387952128.0, + "grad_norm": 0.04038129773095179, + "language_loss": 0.77990985, + "learning_rate": 0.0009814111055381654, + "loss": 0.79158378, + "num_input_tokens_seen": 49193152, + "router_z_loss_mlp": 0.1328125, + "step": 595, + "time_per_iteration": 4.776912450790405 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110386, + "balance_loss_mlp": 1.07250798, + "epoch": 0.1146594844170835, + "flos": 494641603584.0, + "grad_norm": 0.1404346857169784, + "language_loss": 0.89489102, + "learning_rate": 0.0009813268533395648, + "loss": 0.90592968, + "num_input_tokens_seen": 49260960, + "router_z_loss_mlp": 0.3137207, + "step": 596, + "time_per_iteration": 2.562816858291626 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115324, + "balance_loss_mlp": 1.08344746, + "epoch": 0.11485186610234706, + "flos": 474596319744.0, + "grad_norm": 0.07456374098915484, + "language_loss": 0.89145029, + "learning_rate": 0.0009812424142733073, + "loss": 0.90260351, + "num_input_tokens_seen": 49327616, + "router_z_loss_mlp": 0.31884766, + "step": 597, + "time_per_iteration": 2.5198655128479004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123971, + "balance_loss_mlp": 1.0946219, + "epoch": 0.11504424778761062, + "flos": 730858014720.0, + "grad_norm": 0.05033183127205697, + "language_loss": 0.86898923, + "learning_rate": 0.000981157788372175, + "loss": 0.88022888, + "num_input_tokens_seen": 49412864, + "router_z_loss_mlp": 0.29345703, + "step": 598, + "time_per_iteration": 3.004558563232422 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01155063, + "balance_loss_mlp": 1.12290049, + "epoch": 0.11523662947287418, + "flos": 545539788288.0, + "grad_norm": 0.07554757352201513, + "language_loss": 0.90216064, + "learning_rate": 0.0009810729756690223, + "loss": 0.91371131, + "num_input_tokens_seen": 49483584, + "router_z_loss_mlp": 0.3215332, + "step": 599, + "time_per_iteration": 2.7165520191192627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149643, + "balance_loss_mlp": 1.11790919, + "epoch": 0.11542901115813775, + "flos": 774737436672.0, + "grad_norm": 0.08801397326806587, + "language_loss": 0.92855275, + "learning_rate": 0.0009809879761967766, + "loss": 0.94004917, + "num_input_tokens_seen": 49563568, + "router_z_loss_mlp": 0.31738281, + "step": 600, + "time_per_iteration": 2.9548492431640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115619, + "balance_loss_mlp": 1.12004542, + "epoch": 0.11562139284340131, + "flos": 730585972224.0, + "grad_norm": 0.08285308963026158, + "language_loss": 0.87716347, + "learning_rate": 0.0009809027899884378, + "loss": 0.8887254, + "num_input_tokens_seen": 49640800, + "router_z_loss_mlp": 0.36157227, + "step": 601, + "time_per_iteration": 2.9107346534729004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131924, + "balance_loss_mlp": 1.10085821, + "epoch": 0.11581377452866487, + "flos": 535589148672.0, + "grad_norm": 0.07059046613839054, + "language_loss": 0.89834028, + "learning_rate": 0.0009808174170770779, + "loss": 0.90965956, + "num_input_tokens_seen": 49721872, + "router_z_loss_mlp": 0.31079102, + "step": 602, + "time_per_iteration": 2.79127836227417 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01217718, + "balance_loss_mlp": 1.20541608, + "epoch": 0.11600615621392843, + "flos": 1554968613888.0, + "grad_norm": 0.07528653751738872, + "language_loss": 0.84898245, + "learning_rate": 0.0009807318574958418, + "loss": 0.86115962, + "num_input_tokens_seen": 49951472, + "router_z_loss_mlp": 0.12304688, + "step": 603, + "time_per_iteration": 4.862261772155762 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103115, + "balance_loss_mlp": 1.07238269, + "epoch": 0.116198537899192, + "flos": 537178037760.0, + "grad_norm": 0.08106568577848162, + "language_loss": 0.94434869, + "learning_rate": 0.0009806461112779462, + "loss": 0.95537978, + "num_input_tokens_seen": 50021136, + "router_z_loss_mlp": 0.30737305, + "step": 604, + "time_per_iteration": 2.600008249282837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097342, + "balance_loss_mlp": 1.06427336, + "epoch": 0.11639091958445556, + "flos": 453970483200.0, + "grad_norm": 0.09761910402267754, + "language_loss": 0.89590895, + "learning_rate": 0.0009805601784566814, + "loss": 0.90688241, + "num_input_tokens_seen": 50083888, + "router_z_loss_mlp": 0.33056641, + "step": 605, + "time_per_iteration": 2.4687013626098633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097807, + "balance_loss_mlp": 1.06635928, + "epoch": 0.11658330126971912, + "flos": 554815332864.0, + "grad_norm": 0.0628453025897625, + "language_loss": 0.96235836, + "learning_rate": 0.0009804740590654089, + "loss": 0.97333646, + "num_input_tokens_seen": 50151744, + "router_z_loss_mlp": 0.31469727, + "step": 606, + "time_per_iteration": 2.654134750366211 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109435, + "balance_loss_mlp": 1.0789417, + "epoch": 0.11677568295498268, + "flos": 716025968640.0, + "grad_norm": 0.07837472156111998, + "language_loss": 0.90884066, + "learning_rate": 0.0009803877531375635, + "loss": 0.91993499, + "num_input_tokens_seen": 50221248, + "router_z_loss_mlp": 0.30493164, + "step": 607, + "time_per_iteration": 2.825778007507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_mlp": 1.08074808, + "epoch": 0.11696806464024626, + "flos": 609474668544.0, + "grad_norm": 0.07263848878870109, + "language_loss": 0.91923869, + "learning_rate": 0.0009803012607066523, + "loss": 0.93036401, + "num_input_tokens_seen": 50293792, + "router_z_loss_mlp": 0.31787109, + "step": 608, + "time_per_iteration": 2.721005916595459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101062, + "balance_loss_mlp": 1.06980491, + "epoch": 0.11716044632550981, + "flos": 520127087616.0, + "grad_norm": 0.06980646294906427, + "language_loss": 0.9077962, + "learning_rate": 0.0009802145818062543, + "loss": 0.91880679, + "num_input_tokens_seen": 50367760, + "router_z_loss_mlp": 0.31225586, + "step": 609, + "time_per_iteration": 2.707643985748291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102401, + "balance_loss_mlp": 1.07035792, + "epoch": 0.11735282801077337, + "flos": 507246133248.0, + "grad_norm": 0.07162886221417876, + "language_loss": 0.9293434, + "learning_rate": 0.0009801277164700212, + "loss": 0.9403674, + "num_input_tokens_seen": 50435664, + "router_z_loss_mlp": 0.3203125, + "step": 610, + "time_per_iteration": 2.6389639377593994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094537, + "balance_loss_mlp": 1.06323278, + "epoch": 0.11754520969603693, + "flos": 686339965440.0, + "grad_norm": 0.07220465483683103, + "language_loss": 0.90727574, + "learning_rate": 0.0009800406647316776, + "loss": 0.91822106, + "num_input_tokens_seen": 50514144, + "router_z_loss_mlp": 0.31274414, + "step": 611, + "time_per_iteration": 2.8033382892608643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01066854, + "balance_loss_mlp": 1.05369329, + "epoch": 0.1177375913813005, + "flos": 1541673022464.0, + "grad_norm": 0.030783707978337852, + "language_loss": 0.76914459, + "learning_rate": 0.0009799534266250196, + "loss": 0.77981311, + "num_input_tokens_seen": 50738448, + "router_z_loss_mlp": 0.13183594, + "step": 612, + "time_per_iteration": 4.777275562286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116404, + "balance_loss_mlp": 1.08307314, + "epoch": 0.11792997306656407, + "flos": 520269682176.0, + "grad_norm": 0.07589987368124408, + "language_loss": 0.8961159, + "learning_rate": 0.000979866002183916, + "loss": 0.90727997, + "num_input_tokens_seen": 50809328, + "router_z_loss_mlp": 0.33325195, + "step": 613, + "time_per_iteration": 2.6848883628845215 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109453, + "balance_loss_mlp": 1.07719529, + "epoch": 0.11812235475182763, + "flos": 665980379136.0, + "grad_norm": 0.08667718058784188, + "language_loss": 0.91197205, + "learning_rate": 0.0009797783914423082, + "loss": 0.92306662, + "num_input_tokens_seen": 50887728, + "router_z_loss_mlp": 0.32250977, + "step": 614, + "time_per_iteration": 2.832414388656616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105189, + "balance_loss_mlp": 1.07140493, + "epoch": 0.11831473643709119, + "flos": 621021399552.0, + "grad_norm": 0.06050640051516142, + "language_loss": 0.85425436, + "learning_rate": 0.0009796905944342094, + "loss": 0.86530626, + "num_input_tokens_seen": 50966160, + "router_z_loss_mlp": 0.33813477, + "step": 615, + "time_per_iteration": 2.8220455646514893 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112849, + "balance_loss_mlp": 1.07913685, + "epoch": 0.11850711812235475, + "flos": 456438108672.0, + "grad_norm": 0.0714748534502384, + "language_loss": 0.893188, + "learning_rate": 0.0009796026111937057, + "loss": 0.90431643, + "num_input_tokens_seen": 51035712, + "router_z_loss_mlp": 0.3371582, + "step": 616, + "time_per_iteration": 2.590566873550415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102524, + "balance_loss_mlp": 1.07005119, + "epoch": 0.11869949980761832, + "flos": 513598565376.0, + "grad_norm": 0.06492309219220607, + "language_loss": 0.89778733, + "learning_rate": 0.0009795144417549552, + "loss": 0.90881252, + "num_input_tokens_seen": 51108656, + "router_z_loss_mlp": 0.32470703, + "step": 617, + "time_per_iteration": 2.672914505004883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109626, + "balance_loss_mlp": 1.0773685, + "epoch": 0.11889188149288188, + "flos": 534732171264.0, + "grad_norm": 0.057544425945024125, + "language_loss": 0.90660846, + "learning_rate": 0.0009794260861521883, + "loss": 0.9177047, + "num_input_tokens_seen": 51185552, + "router_z_loss_mlp": 0.32250977, + "step": 618, + "time_per_iteration": 2.817354202270508 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102516, + "balance_loss_mlp": 1.07009149, + "epoch": 0.11908426317814544, + "flos": 498344527872.0, + "grad_norm": 0.0773697745436404, + "language_loss": 0.87738883, + "learning_rate": 0.0009793375444197075, + "loss": 0.88841403, + "num_input_tokens_seen": 51255808, + "router_z_loss_mlp": 0.32397461, + "step": 619, + "time_per_iteration": 2.607475996017456 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011109, + "balance_loss_mlp": 1.07697332, + "epoch": 0.119276644863409, + "flos": 659598833664.0, + "grad_norm": 0.06767977381214116, + "language_loss": 0.86337721, + "learning_rate": 0.000979248816591888, + "loss": 0.87448615, + "num_input_tokens_seen": 51329408, + "router_z_loss_mlp": 0.33935547, + "step": 620, + "time_per_iteration": 2.758866548538208 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098794, + "balance_loss_mlp": 1.06667948, + "epoch": 0.11946902654867257, + "flos": 758396487168.0, + "grad_norm": 0.06819106164994826, + "language_loss": 0.87032986, + "learning_rate": 0.0009791599027031766, + "loss": 0.88131785, + "num_input_tokens_seen": 51408784, + "router_z_loss_mlp": 0.32128906, + "step": 621, + "time_per_iteration": 3.029431104660034 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088156, + "balance_loss_mlp": 1.05611241, + "epoch": 0.11966140823393613, + "flos": 680697533952.0, + "grad_norm": 0.0732554324646167, + "language_loss": 0.87112588, + "learning_rate": 0.0009790708027880932, + "loss": 0.88200748, + "num_input_tokens_seen": 51482592, + "router_z_loss_mlp": 0.32055664, + "step": 622, + "time_per_iteration": 2.855576992034912 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01056461, + "balance_loss_mlp": 1.04444504, + "epoch": 0.11985378991919969, + "flos": 1450268070912.0, + "grad_norm": 0.03732324883573809, + "language_loss": 0.77427292, + "learning_rate": 0.0009789815168812293, + "loss": 0.78483754, + "num_input_tokens_seen": 51712240, + "router_z_loss_mlp": 0.12011719, + "step": 623, + "time_per_iteration": 4.840993165969849 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108671, + "balance_loss_mlp": 1.0551914, + "epoch": 0.12004617160446325, + "flos": 527586780672.0, + "grad_norm": 0.07309096746678648, + "language_loss": 0.94236648, + "learning_rate": 0.0009788920450172487, + "loss": 0.9532336, + "num_input_tokens_seen": 51781440, + "router_z_loss_mlp": 0.31518555, + "step": 624, + "time_per_iteration": 2.6301677227020264 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102663, + "balance_loss_mlp": 1.07023823, + "epoch": 0.12023855328972682, + "flos": 473980861440.0, + "grad_norm": 0.15739190650861204, + "language_loss": 0.91515559, + "learning_rate": 0.0009788023872308875, + "loss": 0.92618221, + "num_input_tokens_seen": 51845424, + "router_z_loss_mlp": 0.32421875, + "step": 625, + "time_per_iteration": 2.506446361541748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01014454, + "balance_loss_mlp": 1.0033915, + "epoch": 0.12043093497499038, + "flos": 1530954155520.0, + "grad_norm": 0.02216054665264375, + "language_loss": 0.75428998, + "learning_rate": 0.0009787125435569539, + "loss": 0.76443458, + "num_input_tokens_seen": 52076496, + "router_z_loss_mlp": 0.11083984, + "step": 626, + "time_per_iteration": 4.713289260864258 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114644, + "balance_loss_mlp": 1.08391225, + "epoch": 0.12062331666025394, + "flos": 539571469824.0, + "grad_norm": 0.0672242080300053, + "language_loss": 0.94766486, + "learning_rate": 0.0009786225140303285, + "loss": 0.95881128, + "num_input_tokens_seen": 52143072, + "router_z_loss_mlp": 0.30761719, + "step": 627, + "time_per_iteration": 2.61875057220459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011267, + "balance_loss_mlp": 1.09503818, + "epoch": 0.1208156983455175, + "flos": 511634327040.0, + "grad_norm": 0.06510849521455, + "language_loss": 0.925771, + "learning_rate": 0.0009785322986859634, + "loss": 0.93703806, + "num_input_tokens_seen": 52211888, + "router_z_loss_mlp": 0.31640625, + "step": 628, + "time_per_iteration": 2.6567625999450684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141777, + "balance_loss_mlp": 1.11059177, + "epoch": 0.12100808003078108, + "flos": 596195043840.0, + "grad_norm": 0.06735600063735754, + "language_loss": 0.93719506, + "learning_rate": 0.0009784418975588838, + "loss": 0.94861281, + "num_input_tokens_seen": 52283696, + "router_z_loss_mlp": 0.31152344, + "step": 629, + "time_per_iteration": 2.697376012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122983, + "balance_loss_mlp": 1.09222674, + "epoch": 0.12120046171604464, + "flos": 522698019840.0, + "grad_norm": 0.47103484407124013, + "language_loss": 0.93927598, + "learning_rate": 0.0009783513106841862, + "loss": 0.95050573, + "num_input_tokens_seen": 52358624, + "router_z_loss_mlp": 0.30761719, + "step": 630, + "time_per_iteration": 2.7226808071136475 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143332, + "balance_loss_mlp": 1.13179243, + "epoch": 0.1213928434013082, + "flos": 1553605277184.0, + "grad_norm": 0.056788624646596834, + "language_loss": 0.76732707, + "learning_rate": 0.00097826053809704, + "loss": 0.77876031, + "num_input_tokens_seen": 52591248, + "router_z_loss_mlp": 0.11523438, + "step": 631, + "time_per_iteration": 4.948111295700073 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01228128, + "balance_loss_mlp": 1.19219875, + "epoch": 0.12158522508657175, + "flos": 495143580672.0, + "grad_norm": 0.06834333100250278, + "language_loss": 0.88515621, + "learning_rate": 0.0009781695798326854, + "loss": 0.89743745, + "num_input_tokens_seen": 52659920, + "router_z_loss_mlp": 0.35961914, + "step": 632, + "time_per_iteration": 2.5616555213928223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267845, + "balance_loss_mlp": 1.23050833, + "epoch": 0.12177760677183531, + "flos": 475335433728.0, + "grad_norm": 0.1009303482431908, + "language_loss": 0.88543177, + "learning_rate": 0.0009780784359264365, + "loss": 0.89811015, + "num_input_tokens_seen": 52728832, + "router_z_loss_mlp": 0.37329102, + "step": 633, + "time_per_iteration": 2.597935438156128 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01265484, + "balance_loss_mlp": 1.25370574, + "epoch": 0.12196998845709889, + "flos": 1467630351360.0, + "grad_norm": 0.08843071113371018, + "language_loss": 0.74188697, + "learning_rate": 0.0009779871064136778, + "loss": 0.75454181, + "num_input_tokens_seen": 52949776, + "router_z_loss_mlp": 0.11767578, + "step": 634, + "time_per_iteration": 4.768415451049805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01235432, + "balance_loss_mlp": 1.19976473, + "epoch": 0.12216237014236245, + "flos": 586279309824.0, + "grad_norm": 0.0829698455775257, + "language_loss": 0.88074899, + "learning_rate": 0.000977895591329867, + "loss": 0.89310336, + "num_input_tokens_seen": 53027184, + "router_z_loss_mlp": 0.35668945, + "step": 635, + "time_per_iteration": 2.7918457984924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01214994, + "balance_loss_mlp": 1.17720437, + "epoch": 0.12235475182762601, + "flos": 597721324032.0, + "grad_norm": 0.0916527997361875, + "language_loss": 0.87791145, + "learning_rate": 0.000977803890710533, + "loss": 0.89006138, + "num_input_tokens_seen": 53101072, + "router_z_loss_mlp": 0.37792969, + "step": 636, + "time_per_iteration": 2.7248313426971436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01186705, + "balance_loss_mlp": 1.1509428, + "epoch": 0.12254713351288957, + "flos": 497487550464.0, + "grad_norm": 0.0702522126388857, + "language_loss": 0.93856937, + "learning_rate": 0.0009777120045912774, + "loss": 0.95043641, + "num_input_tokens_seen": 53172992, + "router_z_loss_mlp": 0.35766602, + "step": 637, + "time_per_iteration": 2.6079726219177246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01180704, + "balance_loss_mlp": 1.14236617, + "epoch": 0.12273951519815314, + "flos": 605565130752.0, + "grad_norm": 0.06645311005239844, + "language_loss": 0.90599251, + "learning_rate": 0.0009776199330077736, + "loss": 0.91779959, + "num_input_tokens_seen": 53248256, + "router_z_loss_mlp": 0.38330078, + "step": 638, + "time_per_iteration": 2.7671282291412354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01196025, + "balance_loss_mlp": 1.15940344, + "epoch": 0.1229318968834167, + "flos": 597578729472.0, + "grad_norm": 0.09015200479441979, + "language_loss": 0.93140519, + "learning_rate": 0.0009775276759957667, + "loss": 0.94336545, + "num_input_tokens_seen": 53318960, + "router_z_loss_mlp": 0.36621094, + "step": 639, + "time_per_iteration": 2.6990442276000977 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01179898, + "balance_loss_mlp": 1.14265716, + "epoch": 0.12312427856868026, + "flos": 678082931712.0, + "grad_norm": 0.08188642922116089, + "language_loss": 0.90714514, + "learning_rate": 0.0009774352335910745, + "loss": 0.91894412, + "num_input_tokens_seen": 53389120, + "router_z_loss_mlp": 0.37280273, + "step": 640, + "time_per_iteration": 2.7950265407562256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0115004, + "balance_loss_mlp": 1.11658967, + "epoch": 0.12331666025394382, + "flos": 608656978944.0, + "grad_norm": 0.07361380744806716, + "language_loss": 0.95549798, + "learning_rate": 0.000977342605829586, + "loss": 0.96699834, + "num_input_tokens_seen": 53459056, + "router_z_loss_mlp": 0.3347168, + "step": 641, + "time_per_iteration": 2.6966538429260254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01140018, + "balance_loss_mlp": 1.10497069, + "epoch": 0.12350904193920739, + "flos": 762172194816.0, + "grad_norm": 0.08211004604029591, + "language_loss": 0.86708105, + "learning_rate": 0.0009772497927472623, + "loss": 0.87848121, + "num_input_tokens_seen": 53541552, + "router_z_loss_mlp": 0.35083008, + "step": 642, + "time_per_iteration": 3.050595998764038 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121507, + "balance_loss_mlp": 1.0852437, + "epoch": 0.12370142362447095, + "flos": 540699079680.0, + "grad_norm": 0.0716743258864478, + "language_loss": 0.85363436, + "learning_rate": 0.0009771567943801368, + "loss": 0.86484945, + "num_input_tokens_seen": 53611520, + "router_z_loss_mlp": 0.36254883, + "step": 643, + "time_per_iteration": 2.627019166946411 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112067, + "balance_loss_mlp": 1.07744884, + "epoch": 0.12389380530973451, + "flos": 547848852480.0, + "grad_norm": 0.06992166814052157, + "language_loss": 0.89936745, + "learning_rate": 0.0009770636107643152, + "loss": 0.91048813, + "num_input_tokens_seen": 53683888, + "router_z_loss_mlp": 0.34643555, + "step": 644, + "time_per_iteration": 2.696233034133911 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102963, + "balance_loss_mlp": 1.06846356, + "epoch": 0.12408618699499807, + "flos": 540048715776.0, + "grad_norm": 0.06268128655507912, + "language_loss": 0.88181639, + "learning_rate": 0.0009769702419359738, + "loss": 0.89284605, + "num_input_tokens_seen": 53751888, + "router_z_loss_mlp": 0.3449707, + "step": 645, + "time_per_iteration": 2.61401104927063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116268, + "balance_loss_mlp": 1.0810535, + "epoch": 0.12427856868026164, + "flos": 745451513856.0, + "grad_norm": 0.07610574883038115, + "language_loss": 0.89730537, + "learning_rate": 0.000976876687931362, + "loss": 0.90846807, + "num_input_tokens_seen": 53827648, + "router_z_loss_mlp": 0.35229492, + "step": 646, + "time_per_iteration": 2.999408721923828 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131315, + "balance_loss_mlp": 1.09622002, + "epoch": 0.1244709503655252, + "flos": 533460556800.0, + "grad_norm": 0.19449531308307466, + "language_loss": 0.85410094, + "learning_rate": 0.0009767829487868005, + "loss": 0.86541414, + "num_input_tokens_seen": 53896400, + "router_z_loss_mlp": 0.35107422, + "step": 647, + "time_per_iteration": 2.617666721343994 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138117, + "balance_loss_mlp": 1.10159075, + "epoch": 0.12466333205078876, + "flos": 507847034880.0, + "grad_norm": 0.07509451505155453, + "language_loss": 0.89358151, + "learning_rate": 0.000976689024538682, + "loss": 0.90496266, + "num_input_tokens_seen": 53965904, + "router_z_loss_mlp": 0.36499023, + "step": 648, + "time_per_iteration": 2.5929009914398193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138062, + "balance_loss_mlp": 1.10110736, + "epoch": 0.12485571373605232, + "flos": 681023420928.0, + "grad_norm": 0.07057439208121223, + "language_loss": 0.87662494, + "learning_rate": 0.0009765949152234716, + "loss": 0.8880055, + "num_input_tokens_seen": 54049792, + "router_z_loss_mlp": 0.36962891, + "step": 649, + "time_per_iteration": 2.874701976776123 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147728, + "balance_loss_mlp": 1.13504386, + "epoch": 0.1250480954213159, + "flos": 1329402668544.0, + "grad_norm": 0.04527818124304351, + "language_loss": 0.78686082, + "learning_rate": 0.0009765006208777055, + "loss": 0.79833812, + "num_input_tokens_seen": 54262432, + "router_z_loss_mlp": 0.12695312, + "step": 650, + "time_per_iteration": 4.680933713912964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01138039, + "balance_loss_mlp": 1.10287213, + "epoch": 0.12524047710657946, + "flos": 938140683264.0, + "grad_norm": 0.08375968037938068, + "language_loss": 0.82443976, + "learning_rate": 0.0009764061415379919, + "loss": 0.83582014, + "num_input_tokens_seen": 54351568, + "router_z_loss_mlp": 0.35205078, + "step": 651, + "time_per_iteration": 3.2550604343414307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01135369, + "balance_loss_mlp": 1.09774697, + "epoch": 0.12543285879184302, + "flos": 513642235392.0, + "grad_norm": 0.07146085627000143, + "language_loss": 0.89363486, + "learning_rate": 0.0009763114772410109, + "loss": 0.90498853, + "num_input_tokens_seen": 54418944, + "router_z_loss_mlp": 0.3762207, + "step": 652, + "time_per_iteration": 2.5937142372131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139745, + "balance_loss_mlp": 1.10419679, + "epoch": 0.12562524047710658, + "flos": 717991617024.0, + "grad_norm": 0.07913079577836896, + "language_loss": 0.87230957, + "learning_rate": 0.0009762166280235146, + "loss": 0.88370705, + "num_input_tokens_seen": 54495312, + "router_z_loss_mlp": 0.35571289, + "step": 653, + "time_per_iteration": 2.96162748336792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01147653, + "balance_loss_mlp": 1.10974443, + "epoch": 0.12581762216237014, + "flos": 563441923584.0, + "grad_norm": 0.06492259826928527, + "language_loss": 0.87890899, + "learning_rate": 0.0009761215939223267, + "loss": 0.89038551, + "num_input_tokens_seen": 54566832, + "router_z_loss_mlp": 0.37890625, + "step": 654, + "time_per_iteration": 2.714641809463501 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145182, + "balance_loss_mlp": 1.1077261, + "epoch": 0.1260100038476337, + "flos": 481642785792.0, + "grad_norm": 0.07920721431290144, + "language_loss": 0.86875665, + "learning_rate": 0.0009760263749743428, + "loss": 0.88020849, + "num_input_tokens_seen": 54632128, + "router_z_loss_mlp": 0.37426758, + "step": 655, + "time_per_iteration": 2.547499179840088 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145343, + "balance_loss_mlp": 1.11074805, + "epoch": 0.12620238553289725, + "flos": 575269461504.0, + "grad_norm": 0.06357383816141966, + "language_loss": 0.90176344, + "learning_rate": 0.0009759309712165299, + "loss": 0.91321695, + "num_input_tokens_seen": 54707600, + "router_z_loss_mlp": 0.34570312, + "step": 656, + "time_per_iteration": 2.693922996520996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137509, + "balance_loss_mlp": 1.103248, + "epoch": 0.12639476721816084, + "flos": 530909973504.0, + "grad_norm": 0.07169490366111804, + "language_loss": 0.93258119, + "learning_rate": 0.0009758353826859272, + "loss": 0.94395626, + "num_input_tokens_seen": 54776704, + "router_z_loss_mlp": 0.34277344, + "step": 657, + "time_per_iteration": 2.5744612216949463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01139269, + "balance_loss_mlp": 1.10314822, + "epoch": 0.1265871489034244, + "flos": 689654393856.0, + "grad_norm": 0.06860158128637554, + "language_loss": 0.89679217, + "learning_rate": 0.0009757396094196456, + "loss": 0.90818477, + "num_input_tokens_seen": 54851744, + "router_z_loss_mlp": 0.36132812, + "step": 658, + "time_per_iteration": 2.851700782775879 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01143308, + "balance_loss_mlp": 1.10675859, + "epoch": 0.12677953058868796, + "flos": 536863735296.0, + "grad_norm": 0.0696485175834739, + "language_loss": 0.84555894, + "learning_rate": 0.0009756436514548673, + "loss": 0.85699201, + "num_input_tokens_seen": 54932576, + "router_z_loss_mlp": 0.36523438, + "step": 659, + "time_per_iteration": 2.7971351146698 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122244, + "balance_loss_mlp": 1.08800757, + "epoch": 0.12697191227395152, + "flos": 518749194240.0, + "grad_norm": 0.05327633329409036, + "language_loss": 0.88343394, + "learning_rate": 0.0009755475088288466, + "loss": 0.89465636, + "num_input_tokens_seen": 55007296, + "router_z_loss_mlp": 0.34228516, + "step": 660, + "time_per_iteration": 2.670555353164673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103127, + "balance_loss_mlp": 1.06903291, + "epoch": 0.12716429395921508, + "flos": 566341714944.0, + "grad_norm": 0.06801254087798507, + "language_loss": 0.90210187, + "learning_rate": 0.0009754511815789095, + "loss": 0.91313314, + "num_input_tokens_seen": 55079312, + "router_z_loss_mlp": 0.34106445, + "step": 661, + "time_per_iteration": 2.748224973678589 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102552, + "balance_loss_mlp": 1.06798172, + "epoch": 0.12735667564447864, + "flos": 513844466688.0, + "grad_norm": 0.06975204014846512, + "language_loss": 0.86245489, + "learning_rate": 0.0009753546697424533, + "loss": 0.87348044, + "num_input_tokens_seen": 55151824, + "router_z_loss_mlp": 0.34594727, + "step": 662, + "time_per_iteration": 2.664799213409424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092108, + "balance_loss_mlp": 1.05863369, + "epoch": 0.1275490573297422, + "flos": 541023556608.0, + "grad_norm": 0.05485824904298714, + "language_loss": 0.90572149, + "learning_rate": 0.0009752579733569475, + "loss": 0.91664255, + "num_input_tokens_seen": 55224368, + "router_z_loss_mlp": 0.3347168, + "step": 663, + "time_per_iteration": 2.679975748062134 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01267369, + "balance_loss_mlp": 1.2515384, + "epoch": 0.12774143901500576, + "flos": 1557872787456.0, + "grad_norm": 0.0685532780556388, + "language_loss": 0.74881387, + "learning_rate": 0.0009751610924599328, + "loss": 0.7614876, + "num_input_tokens_seen": 55453584, + "router_z_loss_mlp": 0.15820312, + "step": 664, + "time_per_iteration": 4.938101053237915 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096151, + "balance_loss_mlp": 1.06177139, + "epoch": 0.12793382070026935, + "flos": 613462781952.0, + "grad_norm": 0.06920677464457729, + "language_loss": 0.90523887, + "learning_rate": 0.0009750640270890217, + "loss": 0.9162004, + "num_input_tokens_seen": 55528000, + "router_z_loss_mlp": 0.34375, + "step": 665, + "time_per_iteration": 2.6939845085144043 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099083, + "balance_loss_mlp": 1.06563258, + "epoch": 0.1281262023855329, + "flos": 707386231296.0, + "grad_norm": 0.06773970450457005, + "language_loss": 0.96531481, + "learning_rate": 0.0009749667772818983, + "loss": 0.9763056, + "num_input_tokens_seen": 55612416, + "router_z_loss_mlp": 0.33447266, + "step": 666, + "time_per_iteration": 2.967853307723999 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01164762, + "balance_loss_mlp": 1.15131497, + "epoch": 0.12831858407079647, + "flos": 1424250086400.0, + "grad_norm": 0.045177828452490555, + "language_loss": 0.76935941, + "learning_rate": 0.0009748693430763185, + "loss": 0.78100705, + "num_input_tokens_seen": 55843664, + "router_z_loss_mlp": 0.13476562, + "step": 667, + "time_per_iteration": 4.85069465637207 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093582, + "balance_loss_mlp": 1.05958366, + "epoch": 0.12851096575606002, + "flos": 448869316608.0, + "grad_norm": 0.07778909975942494, + "language_loss": 0.95426726, + "learning_rate": 0.0009747717245101093, + "loss": 0.96520311, + "num_input_tokens_seen": 55909072, + "router_z_loss_mlp": 0.34008789, + "step": 668, + "time_per_iteration": 2.5234692096710205 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098998, + "balance_loss_mlp": 1.06519032, + "epoch": 0.12870334744132358, + "flos": 479697486336.0, + "grad_norm": 0.05465485885236262, + "language_loss": 0.84969366, + "learning_rate": 0.00097467392162117, + "loss": 0.86068368, + "num_input_tokens_seen": 55978544, + "router_z_loss_mlp": 0.33789062, + "step": 669, + "time_per_iteration": 2.601684808731079 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096385, + "balance_loss_mlp": 1.06341171, + "epoch": 0.12889572912658714, + "flos": 638633963520.0, + "grad_norm": 0.05954757179165737, + "language_loss": 0.91292465, + "learning_rate": 0.0009745759344474708, + "loss": 0.92388856, + "num_input_tokens_seen": 56054144, + "router_z_loss_mlp": 0.32983398, + "step": 670, + "time_per_iteration": 2.8225347995758057 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098806, + "balance_loss_mlp": 1.06411648, + "epoch": 0.1290881108118507, + "flos": 509693409792.0, + "grad_norm": 0.06976130099981656, + "language_loss": 0.89229816, + "learning_rate": 0.0009744777630270536, + "loss": 0.90328622, + "num_input_tokens_seen": 56120960, + "router_z_loss_mlp": 0.34692383, + "step": 671, + "time_per_iteration": 2.633571147918701 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109875, + "balance_loss_mlp": 1.07435024, + "epoch": 0.12928049249711426, + "flos": 670746894336.0, + "grad_norm": 0.08011077975608555, + "language_loss": 0.93749923, + "learning_rate": 0.000974379407398032, + "loss": 0.94859791, + "num_input_tokens_seen": 56202560, + "router_z_loss_mlp": 0.35546875, + "step": 672, + "time_per_iteration": 2.875609874725342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093721, + "balance_loss_mlp": 1.06065273, + "epoch": 0.12947287418237785, + "flos": 793158925824.0, + "grad_norm": 0.05850057523774312, + "language_loss": 0.82016242, + "learning_rate": 0.0009742808675985913, + "loss": 0.83109969, + "num_input_tokens_seen": 56289456, + "router_z_loss_mlp": 0.33056641, + "step": 673, + "time_per_iteration": 3.087738275527954 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101029, + "balance_loss_mlp": 1.0646224, + "epoch": 0.1296652558676414, + "flos": 485222054400.0, + "grad_norm": 0.08954381825883409, + "language_loss": 0.9153564, + "learning_rate": 0.0009741821436669876, + "loss": 0.92636657, + "num_input_tokens_seen": 56354480, + "router_z_loss_mlp": 0.36450195, + "step": 674, + "time_per_iteration": 2.539849281311035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101326, + "balance_loss_mlp": 1.06673169, + "epoch": 0.12985763755290497, + "flos": 453226987008.0, + "grad_norm": 0.0648114016490977, + "language_loss": 0.9288274, + "learning_rate": 0.0009740832356415492, + "loss": 0.93984067, + "num_input_tokens_seen": 56418944, + "router_z_loss_mlp": 0.34619141, + "step": 675, + "time_per_iteration": 2.467801094055176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097673, + "balance_loss_mlp": 1.06315041, + "epoch": 0.13005001923816853, + "flos": 824719007232.0, + "grad_norm": 0.0735546441878898, + "language_loss": 0.8857609, + "learning_rate": 0.0009739841435606756, + "loss": 0.89673769, + "num_input_tokens_seen": 56492368, + "router_z_loss_mlp": 0.34545898, + "step": 676, + "time_per_iteration": 3.008781909942627 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109741, + "balance_loss_mlp": 1.06457949, + "epoch": 0.1302424009234321, + "flos": 531107822592.0, + "grad_norm": 0.07312926894822828, + "language_loss": 0.90675485, + "learning_rate": 0.0009738848674628377, + "loss": 0.9177289, + "num_input_tokens_seen": 56568128, + "router_z_loss_mlp": 0.328125, + "step": 677, + "time_per_iteration": 2.695338010787964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104607, + "balance_loss_mlp": 1.06955981, + "epoch": 0.13043478260869565, + "flos": 525626924544.0, + "grad_norm": 0.06033597827839572, + "language_loss": 0.89643902, + "learning_rate": 0.000973785407386578, + "loss": 0.90748513, + "num_input_tokens_seen": 56646448, + "router_z_loss_mlp": 0.35058594, + "step": 678, + "time_per_iteration": 2.7727599143981934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101976, + "balance_loss_mlp": 1.06714272, + "epoch": 0.1306271642939592, + "flos": 625862108160.0, + "grad_norm": 0.05570081952525763, + "language_loss": 0.87361526, + "learning_rate": 0.0009736857633705103, + "loss": 0.88463503, + "num_input_tokens_seen": 56732080, + "router_z_loss_mlp": 0.34814453, + "step": 679, + "time_per_iteration": 2.843129873275757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110176, + "balance_loss_mlp": 1.06630766, + "epoch": 0.13081954597922277, + "flos": 550438723584.0, + "grad_norm": 0.06405817655948583, + "language_loss": 0.93204647, + "learning_rate": 0.0009735859354533196, + "loss": 0.94306409, + "num_input_tokens_seen": 56804432, + "router_z_loss_mlp": 0.35473633, + "step": 680, + "time_per_iteration": 2.7122464179992676 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_mlp": 1.05914354, + "epoch": 0.13101192766448633, + "flos": 536651329536.0, + "grad_norm": 0.06779912020183775, + "language_loss": 0.91948998, + "learning_rate": 0.0009734859236737628, + "loss": 0.93042123, + "num_input_tokens_seen": 56872512, + "router_z_loss_mlp": 0.33984375, + "step": 681, + "time_per_iteration": 2.594881296157837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093326, + "balance_loss_mlp": 1.0593034, + "epoch": 0.13120430934974991, + "flos": 503258019840.0, + "grad_norm": 0.06413082246497326, + "language_loss": 0.93904501, + "learning_rate": 0.0009733857280706678, + "loss": 0.94997829, + "num_input_tokens_seen": 56940928, + "router_z_loss_mlp": 0.34033203, + "step": 682, + "time_per_iteration": 2.5831425189971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010929, + "balance_loss_mlp": 1.05992687, + "epoch": 0.13139669103501347, + "flos": 614014221312.0, + "grad_norm": 0.06246118190021366, + "language_loss": 0.85051745, + "learning_rate": 0.000973285348682934, + "loss": 0.86144638, + "num_input_tokens_seen": 57012736, + "router_z_loss_mlp": 0.33007812, + "step": 683, + "time_per_iteration": 2.7236225605010986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01226892, + "balance_loss_mlp": 1.21096563, + "epoch": 0.13158907272027703, + "flos": 1484163357696.0, + "grad_norm": 0.08359566880013784, + "language_loss": 0.77898371, + "learning_rate": 0.0009731847855495323, + "loss": 0.79125261, + "num_input_tokens_seen": 57243136, + "router_z_loss_mlp": 0.15917969, + "step": 684, + "time_per_iteration": 4.87854790687561 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087932, + "balance_loss_mlp": 1.05488706, + "epoch": 0.1317814544055406, + "flos": 985049344512.0, + "grad_norm": 0.07039095593234826, + "language_loss": 0.85449159, + "learning_rate": 0.0009730840387095046, + "loss": 0.86537099, + "num_input_tokens_seen": 57336160, + "router_z_loss_mlp": 0.33056641, + "step": 685, + "time_per_iteration": 3.30759596824646 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096943, + "balance_loss_mlp": 1.06156158, + "epoch": 0.13197383609080415, + "flos": 611163892224.0, + "grad_norm": 0.05759402546544749, + "language_loss": 0.912597, + "learning_rate": 0.0009729831082019642, + "loss": 0.92356646, + "num_input_tokens_seen": 57418976, + "router_z_loss_mlp": 0.35351562, + "step": 686, + "time_per_iteration": 2.7965087890625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093388, + "balance_loss_mlp": 1.0589608, + "epoch": 0.1321662177760677, + "flos": 494116305408.0, + "grad_norm": 0.058033147986452156, + "language_loss": 0.89668858, + "learning_rate": 0.0009728819940660958, + "loss": 0.90762246, + "num_input_tokens_seen": 57490288, + "router_z_loss_mlp": 0.34399414, + "step": 687, + "time_per_iteration": 2.7347469329833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110102, + "balance_loss_mlp": 1.0653528, + "epoch": 0.13235859946133127, + "flos": 495591713280.0, + "grad_norm": 0.07548862234195632, + "language_loss": 0.86088693, + "learning_rate": 0.0009727806963411557, + "loss": 0.87189722, + "num_input_tokens_seen": 57556064, + "router_z_loss_mlp": 0.35668945, + "step": 688, + "time_per_iteration": 2.621638774871826 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098222, + "balance_loss_mlp": 1.06279302, + "epoch": 0.13255098114659483, + "flos": 511417539072.0, + "grad_norm": 0.08656773393569435, + "language_loss": 0.88000298, + "learning_rate": 0.000972679215066471, + "loss": 0.89098513, + "num_input_tokens_seen": 57627248, + "router_z_loss_mlp": 0.35449219, + "step": 689, + "time_per_iteration": 2.6806418895721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103388, + "balance_loss_mlp": 1.06900764, + "epoch": 0.13274336283185842, + "flos": 547114120704.0, + "grad_norm": 0.07064056682134613, + "language_loss": 0.99675226, + "learning_rate": 0.0009725775502814401, + "loss": 1.00778604, + "num_input_tokens_seen": 57694832, + "router_z_loss_mlp": 0.34350586, + "step": 690, + "time_per_iteration": 2.607179641723633 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121046, + "balance_loss_mlp": 1.08397222, + "epoch": 0.13293574451712198, + "flos": 640465781760.0, + "grad_norm": 0.08777481913975324, + "language_loss": 0.85673726, + "learning_rate": 0.0009724757020255327, + "loss": 0.86794776, + "num_input_tokens_seen": 57771776, + "router_z_loss_mlp": 0.37084961, + "step": 691, + "time_per_iteration": 2.81113338470459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111244, + "balance_loss_mlp": 1.07726967, + "epoch": 0.13312812620238554, + "flos": 491234042880.0, + "grad_norm": 0.09165524457583717, + "language_loss": 0.87811983, + "learning_rate": 0.0009723736703382902, + "loss": 0.88923222, + "num_input_tokens_seen": 57836272, + "router_z_loss_mlp": 0.33984375, + "step": 692, + "time_per_iteration": 2.548689603805542 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110186, + "balance_loss_mlp": 1.0692203, + "epoch": 0.1333205078876491, + "flos": 508693837824.0, + "grad_norm": 0.061462060991887495, + "language_loss": 0.83746743, + "learning_rate": 0.0009722714552593244, + "loss": 0.84848601, + "num_input_tokens_seen": 57907232, + "router_z_loss_mlp": 0.32641602, + "step": 693, + "time_per_iteration": 2.6584513187408447 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099112, + "balance_loss_mlp": 1.06358743, + "epoch": 0.13351288957291266, + "flos": 418474722816.0, + "grad_norm": 0.07144638741394425, + "language_loss": 0.94810003, + "learning_rate": 0.000972169056828319, + "loss": 0.95909119, + "num_input_tokens_seen": 57969808, + "router_z_loss_mlp": 0.35522461, + "step": 694, + "time_per_iteration": 2.461437702178955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100772, + "balance_loss_mlp": 1.06751275, + "epoch": 0.13370527125817622, + "flos": 615614694912.0, + "grad_norm": 0.05672506947017021, + "language_loss": 0.87834966, + "learning_rate": 0.0009720664750850283, + "loss": 0.88935745, + "num_input_tokens_seen": 58042944, + "router_z_loss_mlp": 0.33251953, + "step": 695, + "time_per_iteration": 2.7716193199157715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103519, + "balance_loss_mlp": 1.07085609, + "epoch": 0.13389765294343978, + "flos": 625757391360.0, + "grad_norm": 0.07304651625724701, + "language_loss": 0.93482703, + "learning_rate": 0.0009719637100692784, + "loss": 0.94586229, + "num_input_tokens_seen": 58116080, + "router_z_loss_mlp": 0.32666016, + "step": 696, + "time_per_iteration": 2.7741310596466064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111476, + "balance_loss_mlp": 1.08090401, + "epoch": 0.13409003462870334, + "flos": 609391710720.0, + "grad_norm": 0.06235589965882817, + "language_loss": 0.83759153, + "learning_rate": 0.0009718607618209661, + "loss": 0.84873915, + "num_input_tokens_seen": 58197616, + "router_z_loss_mlp": 0.33862305, + "step": 697, + "time_per_iteration": 2.869180202484131 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01128671, + "balance_loss_mlp": 1.09488726, + "epoch": 0.13428241631396692, + "flos": 683499810816.0, + "grad_norm": 0.0709058406100417, + "language_loss": 0.88053036, + "learning_rate": 0.0009717576303800595, + "loss": 0.89181709, + "num_input_tokens_seen": 58280480, + "router_z_loss_mlp": 0.33789062, + "step": 698, + "time_per_iteration": 3.007253408432007 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122193, + "balance_loss_mlp": 1.08716917, + "epoch": 0.13447479799923048, + "flos": 508565799936.0, + "grad_norm": 0.07060238478807088, + "language_loss": 0.86057615, + "learning_rate": 0.0009716543157865975, + "loss": 0.87179804, + "num_input_tokens_seen": 58352464, + "router_z_loss_mlp": 0.35083008, + "step": 699, + "time_per_iteration": 2.6622114181518555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112811, + "balance_loss_mlp": 1.07812154, + "epoch": 0.13466717968449404, + "flos": 897124737024.0, + "grad_norm": 0.06896685381510245, + "language_loss": 0.84149206, + "learning_rate": 0.0009715508180806907, + "loss": 0.85262012, + "num_input_tokens_seen": 58437216, + "router_z_loss_mlp": 0.34716797, + "step": 700, + "time_per_iteration": 3.175494909286499 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106112, + "balance_loss_mlp": 1.07054055, + "epoch": 0.1348595613697576, + "flos": 989501557248.0, + "grad_norm": 0.07388845252403331, + "language_loss": 0.90260321, + "learning_rate": 0.0009714471373025202, + "loss": 0.91366434, + "num_input_tokens_seen": 58533152, + "router_z_loss_mlp": 0.35546875, + "step": 701, + "time_per_iteration": 3.3912835121154785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090254, + "balance_loss_mlp": 1.05499172, + "epoch": 0.13505194305502116, + "flos": 487580580864.0, + "grad_norm": 0.07959074518459132, + "language_loss": 0.89355272, + "learning_rate": 0.0009713432734923386, + "loss": 0.9044553, + "num_input_tokens_seen": 58601376, + "router_z_loss_mlp": 0.35253906, + "step": 702, + "time_per_iteration": 2.6718733310699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090728, + "balance_loss_mlp": 1.05572796, + "epoch": 0.13524432474028472, + "flos": 613103399424.0, + "grad_norm": 0.06387437846302528, + "language_loss": 0.875036, + "learning_rate": 0.0009712392266904696, + "loss": 0.88594317, + "num_input_tokens_seen": 58676608, + "router_z_loss_mlp": 0.34985352, + "step": 703, + "time_per_iteration": 2.6985831260681152 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.010863, + "balance_loss_mlp": 1.0524683, + "epoch": 0.13543670642554828, + "flos": 904425868800.0, + "grad_norm": 0.06666466963859687, + "language_loss": 0.86250496, + "learning_rate": 0.0009711349969373076, + "loss": 0.87336791, + "num_input_tokens_seen": 58759264, + "router_z_loss_mlp": 0.33862305, + "step": 704, + "time_per_iteration": 3.1328465938568115 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095762, + "balance_loss_mlp": 1.0610956, + "epoch": 0.13562908811081184, + "flos": 550335416832.0, + "grad_norm": 0.0628446006314887, + "language_loss": 0.80944061, + "learning_rate": 0.0009710305842733178, + "loss": 0.82039821, + "num_input_tokens_seen": 58834800, + "router_z_loss_mlp": 0.34667969, + "step": 705, + "time_per_iteration": 2.7668187618255615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093976, + "balance_loss_mlp": 1.06147909, + "epoch": 0.1358214697960754, + "flos": 507797572608.0, + "grad_norm": 0.06635154625105166, + "language_loss": 0.90133065, + "learning_rate": 0.0009709259887390373, + "loss": 0.91227043, + "num_input_tokens_seen": 58901712, + "router_z_loss_mlp": 0.32519531, + "step": 706, + "time_per_iteration": 2.656233072280884 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096924, + "balance_loss_mlp": 1.06390333, + "epoch": 0.136013851481339, + "flos": 528640197120.0, + "grad_norm": 0.09290535615143355, + "language_loss": 0.91425377, + "learning_rate": 0.0009708212103750737, + "loss": 0.92522299, + "num_input_tokens_seen": 58967824, + "router_z_loss_mlp": 0.33007812, + "step": 707, + "time_per_iteration": 2.569655656814575 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101147, + "balance_loss_mlp": 1.06812644, + "epoch": 0.13620623316660255, + "flos": 658772379648.0, + "grad_norm": 0.06731423560591156, + "language_loss": 0.87756282, + "learning_rate": 0.0009707162492221051, + "loss": 0.88857424, + "num_input_tokens_seen": 59045040, + "router_z_loss_mlp": 0.33007812, + "step": 708, + "time_per_iteration": 2.880669593811035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103707, + "balance_loss_mlp": 1.07009029, + "epoch": 0.1363986148518661, + "flos": 671583522816.0, + "grad_norm": 0.07312175328849302, + "language_loss": 0.88322687, + "learning_rate": 0.0009706111053208815, + "loss": 0.89426386, + "num_input_tokens_seen": 59117216, + "router_z_loss_mlp": 0.33642578, + "step": 709, + "time_per_iteration": 2.7878787517547607 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097257, + "balance_loss_mlp": 1.06342554, + "epoch": 0.13659099653712967, + "flos": 472828520448.0, + "grad_norm": 0.06741688104713542, + "language_loss": 0.86067665, + "learning_rate": 0.0009705057787122232, + "loss": 0.87164921, + "num_input_tokens_seen": 59183056, + "router_z_loss_mlp": 0.33862305, + "step": 710, + "time_per_iteration": 2.528298854827881 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105446, + "balance_loss_mlp": 1.07190061, + "epoch": 0.13678337822239323, + "flos": 452483490816.0, + "grad_norm": 0.05706590332145298, + "language_loss": 0.91653168, + "learning_rate": 0.0009704002694370216, + "loss": 0.92758614, + "num_input_tokens_seen": 59247312, + "router_z_loss_mlp": 0.33569336, + "step": 711, + "time_per_iteration": 2.5201761722564697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114394, + "balance_loss_mlp": 1.0794661, + "epoch": 0.13697575990765679, + "flos": 519373416960.0, + "grad_norm": 0.06387130477766731, + "language_loss": 0.86892813, + "learning_rate": 0.0009702945775362388, + "loss": 0.88007212, + "num_input_tokens_seen": 59317968, + "router_z_loss_mlp": 0.34960938, + "step": 712, + "time_per_iteration": 2.661848783493042 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130562, + "balance_loss_mlp": 1.0947994, + "epoch": 0.13716814159292035, + "flos": 480145618944.0, + "grad_norm": 0.06038249383316015, + "language_loss": 0.87339497, + "learning_rate": 0.0009701887030509086, + "loss": 0.8847006, + "num_input_tokens_seen": 59387936, + "router_z_loss_mlp": 0.35766602, + "step": 713, + "time_per_iteration": 2.6068434715270996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125148, + "balance_loss_mlp": 1.0908401, + "epoch": 0.1373605232781839, + "flos": 545376844800.0, + "grad_norm": 0.06924339631343991, + "language_loss": 0.92127877, + "learning_rate": 0.0009700826460221346, + "loss": 0.93253028, + "num_input_tokens_seen": 59460624, + "router_z_loss_mlp": 0.34301758, + "step": 714, + "time_per_iteration": 2.653224468231201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01145818, + "balance_loss_mlp": 1.11050797, + "epoch": 0.1375529049634475, + "flos": 708473143296.0, + "grad_norm": 0.0682346884445605, + "language_loss": 0.93435562, + "learning_rate": 0.0009699764064910921, + "loss": 0.94581378, + "num_input_tokens_seen": 59536752, + "router_z_loss_mlp": 0.35302734, + "step": 715, + "time_per_iteration": 2.878445625305176 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01130305, + "balance_loss_mlp": 1.09542441, + "epoch": 0.13774528664871105, + "flos": 486452971008.0, + "grad_norm": 0.07091873756636237, + "language_loss": 0.87931371, + "learning_rate": 0.0009698699844990268, + "loss": 0.89061677, + "num_input_tokens_seen": 59608128, + "router_z_loss_mlp": 0.34863281, + "step": 716, + "time_per_iteration": 2.6278092861175537 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124133, + "balance_loss_mlp": 1.09070659, + "epoch": 0.1379376683339746, + "flos": 679885636608.0, + "grad_norm": 0.0686032560828043, + "language_loss": 0.88731855, + "learning_rate": 0.0009697633800872555, + "loss": 0.89855987, + "num_input_tokens_seen": 59685120, + "router_z_loss_mlp": 0.33422852, + "step": 717, + "time_per_iteration": 2.888576030731201 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01112997, + "balance_loss_mlp": 1.07825947, + "epoch": 0.13813005001923817, + "flos": 610628419584.0, + "grad_norm": 0.07907714555147631, + "language_loss": 0.9128629, + "learning_rate": 0.0009696565932971655, + "loss": 0.92399287, + "num_input_tokens_seen": 59763376, + "router_z_loss_mlp": 0.34741211, + "step": 718, + "time_per_iteration": 2.8937225341796875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110249, + "balance_loss_mlp": 1.06837237, + "epoch": 0.13832243170450173, + "flos": 588431222784.0, + "grad_norm": 0.05947825646897862, + "language_loss": 0.9001984, + "learning_rate": 0.0009695496241702153, + "loss": 0.91122329, + "num_input_tokens_seen": 59836800, + "router_z_loss_mlp": 0.34155273, + "step": 719, + "time_per_iteration": 2.791111469268799 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094313, + "balance_loss_mlp": 1.06093454, + "epoch": 0.1385148133897653, + "flos": 699674844672.0, + "grad_norm": 0.07440757355955382, + "language_loss": 0.86308432, + "learning_rate": 0.0009694424727479339, + "loss": 0.87402749, + "num_input_tokens_seen": 59914720, + "router_z_loss_mlp": 0.33398438, + "step": 720, + "time_per_iteration": 2.8781325817108154 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088445, + "balance_loss_mlp": 1.05475688, + "epoch": 0.13870719507502885, + "flos": 597977399808.0, + "grad_norm": 0.059872525751604476, + "language_loss": 0.90073895, + "learning_rate": 0.0009693351390719213, + "loss": 0.91162348, + "num_input_tokens_seen": 59984544, + "router_z_loss_mlp": 0.3371582, + "step": 721, + "time_per_iteration": 2.691493272781372 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095999, + "balance_loss_mlp": 1.06242967, + "epoch": 0.1388995767602924, + "flos": 586279309824.0, + "grad_norm": 0.07792099406652078, + "language_loss": 0.91640067, + "learning_rate": 0.000969227623183848, + "loss": 0.92736065, + "num_input_tokens_seen": 60057056, + "router_z_loss_mlp": 0.33569336, + "step": 722, + "time_per_iteration": 2.768209218978882 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086676, + "balance_loss_mlp": 1.05475235, + "epoch": 0.139091958445556, + "flos": 650810709504.0, + "grad_norm": 0.07717859695455091, + "language_loss": 0.91485119, + "learning_rate": 0.0009691199251254554, + "loss": 0.92571795, + "num_input_tokens_seen": 60133232, + "router_z_loss_mlp": 0.3190918, + "step": 723, + "time_per_iteration": 2.813594102859497 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093708, + "balance_loss_mlp": 1.06159282, + "epoch": 0.13928434013081956, + "flos": 575446961664.0, + "grad_norm": 0.06414169604653322, + "language_loss": 0.8718468, + "learning_rate": 0.0009690120449385555, + "loss": 0.88278389, + "num_input_tokens_seen": 60207104, + "router_z_loss_mlp": 0.32104492, + "step": 724, + "time_per_iteration": 2.732372999191284 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110008, + "balance_loss_mlp": 1.06574821, + "epoch": 0.13947672181608312, + "flos": 562954503168.0, + "grad_norm": 0.07538454681544235, + "language_loss": 0.93399024, + "learning_rate": 0.0009689039826650312, + "loss": 0.94499099, + "num_input_tokens_seen": 60277920, + "router_z_loss_mlp": 0.34375, + "step": 725, + "time_per_iteration": 2.769481658935547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111743, + "balance_loss_mlp": 1.09967864, + "epoch": 0.13966910350134668, + "flos": 1520699387904.0, + "grad_norm": 0.042030956775344956, + "language_loss": 0.76523066, + "learning_rate": 0.000968795738346836, + "loss": 0.77634799, + "num_input_tokens_seen": 60494224, + "router_z_loss_mlp": 0.12060547, + "step": 726, + "time_per_iteration": 4.903716802597046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101674, + "balance_loss_mlp": 1.06619751, + "epoch": 0.13986148518661023, + "flos": 499604557824.0, + "grad_norm": 0.07361028590256702, + "language_loss": 0.88265646, + "learning_rate": 0.0009686873120259941, + "loss": 0.89367324, + "num_input_tokens_seen": 60562176, + "router_z_loss_mlp": 0.35522461, + "step": 727, + "time_per_iteration": 2.639673948287964 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099007, + "balance_loss_mlp": 1.06612897, + "epoch": 0.1400538668718738, + "flos": 598381862400.0, + "grad_norm": 0.053177263225715844, + "language_loss": 0.87612498, + "learning_rate": 0.0009685787037446004, + "loss": 0.88711506, + "num_input_tokens_seen": 60631472, + "router_z_loss_mlp": 0.32885742, + "step": 728, + "time_per_iteration": 2.7457332611083984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095755, + "balance_loss_mlp": 1.06135106, + "epoch": 0.14024624855713735, + "flos": 593757941760.0, + "grad_norm": 0.0730266030670127, + "language_loss": 0.88032103, + "learning_rate": 0.0009684699135448201, + "loss": 0.89127851, + "num_input_tokens_seen": 60703488, + "router_z_loss_mlp": 0.34423828, + "step": 729, + "time_per_iteration": 2.6995558738708496 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091636, + "balance_loss_mlp": 1.05940139, + "epoch": 0.1404386302424009, + "flos": 506335311360.0, + "grad_norm": 0.06378774069808751, + "language_loss": 0.93033969, + "learning_rate": 0.0009683609414688895, + "loss": 0.94125605, + "num_input_tokens_seen": 60773936, + "router_z_loss_mlp": 0.32226562, + "step": 730, + "time_per_iteration": 2.6648926734924316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097348, + "balance_loss_mlp": 1.06175184, + "epoch": 0.14063101192766447, + "flos": 573132105216.0, + "grad_norm": 0.05452232030629634, + "language_loss": 0.86945236, + "learning_rate": 0.0009682517875591154, + "loss": 0.88042581, + "num_input_tokens_seen": 60851120, + "router_z_loss_mlp": 0.35620117, + "step": 731, + "time_per_iteration": 2.7333967685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099629, + "balance_loss_mlp": 1.06656027, + "epoch": 0.14082339361292806, + "flos": 564333806592.0, + "grad_norm": 0.06482276791137384, + "language_loss": 0.87207299, + "learning_rate": 0.0009681424518578749, + "loss": 0.88306928, + "num_input_tokens_seen": 60924896, + "router_z_loss_mlp": 0.33081055, + "step": 732, + "time_per_iteration": 2.706704616546631 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100389, + "balance_loss_mlp": 1.06734443, + "epoch": 0.14101577529819162, + "flos": 463336187904.0, + "grad_norm": 0.05411989278901109, + "language_loss": 0.88122302, + "learning_rate": 0.000968032934407616, + "loss": 0.89222693, + "num_input_tokens_seen": 60996016, + "router_z_loss_mlp": 0.33056641, + "step": 733, + "time_per_iteration": 2.5904436111450195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100997, + "balance_loss_mlp": 1.06766593, + "epoch": 0.14120815698345518, + "flos": 595791991296.0, + "grad_norm": 0.06321555834593343, + "language_loss": 0.82077157, + "learning_rate": 0.0009679232352508571, + "loss": 0.83178151, + "num_input_tokens_seen": 61072016, + "router_z_loss_mlp": 0.33349609, + "step": 734, + "time_per_iteration": 2.758493423461914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102299, + "balance_loss_mlp": 1.06992185, + "epoch": 0.14140053866871874, + "flos": 534864591360.0, + "grad_norm": 0.05697576898708014, + "language_loss": 0.81442666, + "learning_rate": 0.0009678133544301871, + "loss": 0.82544965, + "num_input_tokens_seen": 61144528, + "router_z_loss_mlp": 0.32373047, + "step": 735, + "time_per_iteration": 2.6508195400238037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092208, + "balance_loss_mlp": 1.06006956, + "epoch": 0.1415929203539823, + "flos": 520013606400.0, + "grad_norm": 0.0400187761209974, + "language_loss": 0.91843486, + "learning_rate": 0.0009677032919882658, + "loss": 0.92935699, + "num_input_tokens_seen": 61216960, + "router_z_loss_mlp": 0.32128906, + "step": 736, + "time_per_iteration": 2.705019474029541 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100492, + "balance_loss_mlp": 1.06975937, + "epoch": 0.14178530203924586, + "flos": 482095300608.0, + "grad_norm": 0.07179339183341249, + "language_loss": 0.92199683, + "learning_rate": 0.000967593047967823, + "loss": 0.93300164, + "num_input_tokens_seen": 61281312, + "router_z_loss_mlp": 0.30712891, + "step": 737, + "time_per_iteration": 2.55415415763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109678, + "balance_loss_mlp": 1.07577443, + "epoch": 0.14197768372450942, + "flos": 676339863552.0, + "grad_norm": 0.08640894081958116, + "language_loss": 0.87084705, + "learning_rate": 0.0009674826224116593, + "loss": 0.88194382, + "num_input_tokens_seen": 61355888, + "router_z_loss_mlp": 0.33911133, + "step": 738, + "time_per_iteration": 2.819878101348877 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097633, + "balance_loss_mlp": 1.06544614, + "epoch": 0.14217006540977298, + "flos": 445802199552.0, + "grad_norm": 0.06953952980021996, + "language_loss": 0.8713401, + "learning_rate": 0.0009673720153626455, + "loss": 0.88231641, + "num_input_tokens_seen": 61424288, + "router_z_loss_mlp": 0.32177734, + "step": 739, + "time_per_iteration": 2.5987422466278076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096281, + "balance_loss_mlp": 1.06385565, + "epoch": 0.14236244709503657, + "flos": 496261016064.0, + "grad_norm": 0.08400230511878481, + "language_loss": 0.87465405, + "learning_rate": 0.0009672612268637235, + "loss": 0.88561684, + "num_input_tokens_seen": 61493344, + "router_z_loss_mlp": 0.32421875, + "step": 740, + "time_per_iteration": 2.6148736476898193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098472, + "balance_loss_mlp": 1.06669128, + "epoch": 0.14255482878030012, + "flos": 648022989312.0, + "grad_norm": 0.0806935070673247, + "language_loss": 0.846753, + "learning_rate": 0.0009671502569579048, + "loss": 0.85773772, + "num_input_tokens_seen": 61565216, + "router_z_loss_mlp": 0.31762695, + "step": 741, + "time_per_iteration": 2.7533769607543945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089599, + "balance_loss_mlp": 1.05774641, + "epoch": 0.14274721046556368, + "flos": 535888894464.0, + "grad_norm": 0.06572551706098649, + "language_loss": 0.90748239, + "learning_rate": 0.0009670391056882719, + "loss": 0.91837835, + "num_input_tokens_seen": 61640928, + "router_z_loss_mlp": 0.31835938, + "step": 742, + "time_per_iteration": 2.698690176010132 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088511, + "balance_loss_mlp": 1.0565629, + "epoch": 0.14293959215082724, + "flos": 956677215744.0, + "grad_norm": 0.07291469749344824, + "language_loss": 0.89417249, + "learning_rate": 0.0009669277730979776, + "loss": 0.90505755, + "num_input_tokens_seen": 61717552, + "router_z_loss_mlp": 0.31958008, + "step": 743, + "time_per_iteration": 3.1728732585906982 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108649, + "balance_loss_mlp": 1.05408931, + "epoch": 0.1431319738360908, + "flos": 692766590976.0, + "grad_norm": 0.06693583917292938, + "language_loss": 0.85588205, + "learning_rate": 0.0009668162592302449, + "loss": 0.86674696, + "num_input_tokens_seen": 61800016, + "router_z_loss_mlp": 0.32397461, + "step": 744, + "time_per_iteration": 2.896467685699463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099426, + "balance_loss_mlp": 1.06673896, + "epoch": 0.14332435552135436, + "flos": 565174817280.0, + "grad_norm": 0.0717564206721674, + "language_loss": 0.86683381, + "learning_rate": 0.0009667045641283676, + "loss": 0.877828, + "num_input_tokens_seen": 61865904, + "router_z_loss_mlp": 0.3269043, + "step": 745, + "time_per_iteration": 2.6326427459716797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095955, + "balance_loss_mlp": 1.06336319, + "epoch": 0.14351673720661792, + "flos": 738045665280.0, + "grad_norm": 0.07083856064802352, + "language_loss": 0.95545924, + "learning_rate": 0.0009665926878357092, + "loss": 0.96641874, + "num_input_tokens_seen": 61945728, + "router_z_loss_mlp": 0.32592773, + "step": 746, + "time_per_iteration": 2.902628183364868 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108393, + "balance_loss_mlp": 1.07565856, + "epoch": 0.14370911889188148, + "flos": 548951731200.0, + "grad_norm": 0.08672542857876225, + "language_loss": 0.91510898, + "learning_rate": 0.0009664806303957043, + "loss": 0.92619288, + "num_input_tokens_seen": 62016288, + "router_z_loss_mlp": 0.32714844, + "step": 747, + "time_per_iteration": 2.678656578063965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107271, + "balance_loss_mlp": 1.07448816, + "epoch": 0.14390150057714507, + "flos": 589973469696.0, + "grad_norm": 0.06575006445724518, + "language_loss": 0.87633115, + "learning_rate": 0.0009663683918518571, + "loss": 0.88740385, + "num_input_tokens_seen": 62097904, + "router_z_loss_mlp": 0.32788086, + "step": 748, + "time_per_iteration": 2.894339084625244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116744, + "balance_loss_mlp": 1.08226848, + "epoch": 0.14409388226240863, + "flos": 590773782528.0, + "grad_norm": 0.06412555003569581, + "language_loss": 0.86334193, + "learning_rate": 0.0009662559722477428, + "loss": 0.87450933, + "num_input_tokens_seen": 62166736, + "router_z_loss_mlp": 0.3449707, + "step": 749, + "time_per_iteration": 2.6673357486724854 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0116866, + "balance_loss_mlp": 1.15397346, + "epoch": 0.1442862639476722, + "flos": 1510418479104.0, + "grad_norm": 0.05654081816866197, + "language_loss": 0.7616297, + "learning_rate": 0.0009661433716270062, + "loss": 0.77331638, + "num_input_tokens_seen": 62402512, + "router_z_loss_mlp": 0.14648438, + "step": 750, + "time_per_iteration": 4.97744607925415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111461, + "balance_loss_mlp": 1.0782733, + "epoch": 0.14447864563293575, + "flos": 496493770752.0, + "grad_norm": 0.05840496998451829, + "language_loss": 0.89989787, + "learning_rate": 0.0009660305900333632, + "loss": 0.91101241, + "num_input_tokens_seen": 62473408, + "router_z_loss_mlp": 0.33203125, + "step": 751, + "time_per_iteration": 2.6919631958007812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108129, + "balance_loss_mlp": 1.07513142, + "epoch": 0.1446710273181993, + "flos": 589400271360.0, + "grad_norm": 0.0663289310880325, + "language_loss": 0.83084202, + "learning_rate": 0.0009659176275105992, + "loss": 0.8419233, + "num_input_tokens_seen": 62547440, + "router_z_loss_mlp": 0.33007812, + "step": 752, + "time_per_iteration": 2.702003240585327 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097403, + "balance_loss_mlp": 1.0634284, + "epoch": 0.14486340900346287, + "flos": 585521256960.0, + "grad_norm": 0.05748666507804042, + "language_loss": 0.86628646, + "learning_rate": 0.0009658044841025701, + "loss": 0.87726045, + "num_input_tokens_seen": 62620224, + "router_z_loss_mlp": 0.34008789, + "step": 753, + "time_per_iteration": 2.7666702270507812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106626, + "balance_loss_mlp": 1.07114923, + "epoch": 0.14505579068872643, + "flos": 504405978624.0, + "grad_norm": 0.07320865998852653, + "language_loss": 0.81996346, + "learning_rate": 0.0009656911598532021, + "loss": 0.83102977, + "num_input_tokens_seen": 62690464, + "router_z_loss_mlp": 0.35498047, + "step": 754, + "time_per_iteration": 2.6273839473724365 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094053, + "balance_loss_mlp": 1.05936301, + "epoch": 0.14524817237399, + "flos": 486566452224.0, + "grad_norm": 0.05776902712696923, + "language_loss": 0.90229332, + "learning_rate": 0.0009655776548064917, + "loss": 0.91323388, + "num_input_tokens_seen": 62762240, + "router_z_loss_mlp": 0.34667969, + "step": 755, + "time_per_iteration": 2.6639461517333984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092368, + "balance_loss_mlp": 1.05867922, + "epoch": 0.14544055405925355, + "flos": 727857888768.0, + "grad_norm": 0.059694446461720084, + "language_loss": 0.88762641, + "learning_rate": 0.0009654639690065054, + "loss": 0.89855003, + "num_input_tokens_seen": 62839760, + "router_z_loss_mlp": 0.33691406, + "step": 756, + "time_per_iteration": 2.881164789199829 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092737, + "balance_loss_mlp": 1.05981112, + "epoch": 0.14563293574451713, + "flos": 593359271424.0, + "grad_norm": 0.0719411984245977, + "language_loss": 0.88362074, + "learning_rate": 0.00096535010249738, + "loss": 0.89454818, + "num_input_tokens_seen": 62910336, + "router_z_loss_mlp": 0.3293457, + "step": 757, + "time_per_iteration": 2.703355312347412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092325, + "balance_loss_mlp": 1.05925632, + "epoch": 0.1458253174297807, + "flos": 560192924160.0, + "grad_norm": 0.09095988428785044, + "language_loss": 0.8300786, + "learning_rate": 0.0009652360553233224, + "loss": 0.84100187, + "num_input_tokens_seen": 62988160, + "router_z_loss_mlp": 0.33081055, + "step": 758, + "time_per_iteration": 2.7321062088012695 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01062711, + "balance_loss_mlp": 1.04821551, + "epoch": 0.14601769911504425, + "flos": 1557025984512.0, + "grad_norm": 0.03493248396843453, + "language_loss": 0.73773748, + "learning_rate": 0.0009651218275286093, + "loss": 0.74836457, + "num_input_tokens_seen": 63224704, + "router_z_loss_mlp": 0.14453125, + "step": 759, + "time_per_iteration": 4.917184591293335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099151, + "balance_loss_mlp": 1.06605887, + "epoch": 0.1462100808003078, + "flos": 865922628096.0, + "grad_norm": 0.05465610046720203, + "language_loss": 0.8166393, + "learning_rate": 0.0009650074191575883, + "loss": 0.82763088, + "num_input_tokens_seen": 63312400, + "router_z_loss_mlp": 0.33105469, + "step": 760, + "time_per_iteration": 3.2009472846984863 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097005, + "balance_loss_mlp": 1.06341171, + "epoch": 0.14640246248557137, + "flos": 522673288704.0, + "grad_norm": 0.07890258703475667, + "language_loss": 0.86329532, + "learning_rate": 0.0009648928302546766, + "loss": 0.87426543, + "num_input_tokens_seen": 63387792, + "router_z_loss_mlp": 0.3359375, + "step": 761, + "time_per_iteration": 2.6858482360839844 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087335, + "balance_loss_mlp": 1.05340791, + "epoch": 0.14659484417083493, + "flos": 1030121805312.0, + "grad_norm": 0.05505233607608704, + "language_loss": 0.8584463, + "learning_rate": 0.0009647780608643613, + "loss": 0.86931968, + "num_input_tokens_seen": 63475632, + "router_z_loss_mlp": 0.33935547, + "step": 762, + "time_per_iteration": 3.3784618377685547 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087006, + "balance_loss_mlp": 1.05365133, + "epoch": 0.1467872258560985, + "flos": 500426629632.0, + "grad_norm": 0.083565321416964, + "language_loss": 0.88299912, + "learning_rate": 0.0009646631110312001, + "loss": 0.89386916, + "num_input_tokens_seen": 63546080, + "router_z_loss_mlp": 0.33349609, + "step": 763, + "time_per_iteration": 2.642038345336914 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096574, + "balance_loss_mlp": 1.06465006, + "epoch": 0.14697960754136205, + "flos": 547514201088.0, + "grad_norm": 0.05646167170610495, + "language_loss": 0.88908124, + "learning_rate": 0.0009645479807998203, + "loss": 0.900047, + "num_input_tokens_seen": 63622464, + "router_z_loss_mlp": 0.3190918, + "step": 764, + "time_per_iteration": 2.7709102630615234 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093321, + "balance_loss_mlp": 1.0614922, + "epoch": 0.14717198922662564, + "flos": 517586678784.0, + "grad_norm": 0.06731397985108602, + "language_loss": 0.93233657, + "learning_rate": 0.0009644326702149196, + "loss": 0.94326979, + "num_input_tokens_seen": 63694736, + "router_z_loss_mlp": 0.31811523, + "step": 765, + "time_per_iteration": 2.691761016845703 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098194, + "balance_loss_mlp": 1.06472015, + "epoch": 0.1473643709118892, + "flos": 731661147648.0, + "grad_norm": 0.08664060064789567, + "language_loss": 0.85604531, + "learning_rate": 0.0009643171793212653, + "loss": 0.86702728, + "num_input_tokens_seen": 63779072, + "router_z_loss_mlp": 0.33496094, + "step": 766, + "time_per_iteration": 3.0578510761260986 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095374, + "balance_loss_mlp": 1.06190002, + "epoch": 0.14755675259715276, + "flos": 620257554432.0, + "grad_norm": 0.06875066800131625, + "language_loss": 0.90379435, + "learning_rate": 0.0009642015081636952, + "loss": 0.91474807, + "num_input_tokens_seen": 63847472, + "router_z_loss_mlp": 0.33496094, + "step": 767, + "time_per_iteration": 2.690892219543457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091039, + "balance_loss_mlp": 1.05830407, + "epoch": 0.14774913428241632, + "flos": 451981513728.0, + "grad_norm": 0.06617868208271054, + "language_loss": 0.88812423, + "learning_rate": 0.0009640856567871166, + "loss": 0.89903462, + "num_input_tokens_seen": 63912496, + "router_z_loss_mlp": 0.32714844, + "step": 768, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086849, + "balance_loss_mlp": 1.05316067, + "epoch": 0.14794151596767988, + "flos": 836881196544.0, + "grad_norm": 0.06813910901976611, + "language_loss": 0.89643073, + "learning_rate": 0.0009639696252365072, + "loss": 0.90729922, + "num_input_tokens_seen": 63990832, + "router_z_loss_mlp": 0.33691406, + "step": 769, + "time_per_iteration": 3.036872386932373 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087914, + "balance_loss_mlp": 1.05546558, + "epoch": 0.14813389765294344, + "flos": 685765204992.0, + "grad_norm": 0.06952898718112278, + "language_loss": 0.82433641, + "learning_rate": 0.0009638534135569144, + "loss": 0.83521557, + "num_input_tokens_seen": 64067552, + "router_z_loss_mlp": 0.32446289, + "step": 770, + "time_per_iteration": 2.920228958129883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096521, + "balance_loss_mlp": 1.06395316, + "epoch": 0.148326279338207, + "flos": 509625008640.0, + "grad_norm": 0.05850145176667806, + "language_loss": 0.90417981, + "learning_rate": 0.0009637370217934554, + "loss": 0.91514498, + "num_input_tokens_seen": 64140336, + "router_z_loss_mlp": 0.32568359, + "step": 771, + "time_per_iteration": 2.6692943572998047 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0624088, + "epoch": 0.14851866102347056, + "flos": 587869608960.0, + "grad_norm": 0.06374792966079154, + "language_loss": 0.83362675, + "learning_rate": 0.0009636204499913175, + "loss": 0.84457153, + "num_input_tokens_seen": 64223472, + "router_z_loss_mlp": 0.32055664, + "step": 772, + "time_per_iteration": 2.9103784561157227 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101595, + "balance_loss_mlp": 1.07129157, + "epoch": 0.14871104270873411, + "flos": 690722366976.0, + "grad_norm": 0.05784692032564958, + "language_loss": 0.8891257, + "learning_rate": 0.0009635036981957581, + "loss": 0.90014172, + "num_input_tokens_seen": 64299872, + "router_z_loss_mlp": 0.30273438, + "step": 773, + "time_per_iteration": 2.840233087539673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109331, + "balance_loss_mlp": 1.06112361, + "epoch": 0.1489034243939977, + "flos": 654803205120.0, + "grad_norm": 0.06091674471201955, + "language_loss": 0.9126395, + "learning_rate": 0.0009633867664521043, + "loss": 0.9235726, + "num_input_tokens_seen": 64377152, + "router_z_loss_mlp": 0.32202148, + "step": 774, + "time_per_iteration": 2.8467912673950195 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098219, + "balance_loss_mlp": 1.0643878, + "epoch": 0.14909580607926126, + "flos": 475595891712.0, + "grad_norm": 0.06395321005815084, + "language_loss": 0.87366414, + "learning_rate": 0.0009632696548057527, + "loss": 0.8846463, + "num_input_tokens_seen": 64443008, + "router_z_loss_mlp": 0.33862305, + "step": 775, + "time_per_iteration": 2.55267596244812 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090534, + "balance_loss_mlp": 1.05729866, + "epoch": 0.14928818776452482, + "flos": 610789953024.0, + "grad_norm": 0.07257335679926562, + "language_loss": 0.85489643, + "learning_rate": 0.0009631523633021704, + "loss": 0.86580181, + "num_input_tokens_seen": 64519776, + "router_z_loss_mlp": 0.33251953, + "step": 776, + "time_per_iteration": 2.800656795501709 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090063, + "balance_loss_mlp": 1.05694628, + "epoch": 0.14948056944978838, + "flos": 561487859712.0, + "grad_norm": 0.058446141184189525, + "language_loss": 0.88943005, + "learning_rate": 0.0009630348919868936, + "loss": 0.90033066, + "num_input_tokens_seen": 64593712, + "router_z_loss_mlp": 0.33129883, + "step": 777, + "time_per_iteration": 2.7306644916534424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088502, + "balance_loss_mlp": 1.05397916, + "epoch": 0.14967295113505194, + "flos": 448972623360.0, + "grad_norm": 0.08136314957760014, + "language_loss": 0.81536144, + "learning_rate": 0.0009629172409055293, + "loss": 0.82624644, + "num_input_tokens_seen": 64658448, + "router_z_loss_mlp": 0.34545898, + "step": 778, + "time_per_iteration": 2.532480239868164 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091534, + "balance_loss_mlp": 1.05937171, + "epoch": 0.1498653328203155, + "flos": 571000541184.0, + "grad_norm": 0.06865521140792329, + "language_loss": 0.88039231, + "learning_rate": 0.0009627994101037531, + "loss": 0.89130771, + "num_input_tokens_seen": 64734144, + "router_z_loss_mlp": 0.3215332, + "step": 779, + "time_per_iteration": 2.7336056232452393 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091586, + "balance_loss_mlp": 1.05811191, + "epoch": 0.15005771450557906, + "flos": 630918194688.0, + "grad_norm": 0.06277485509918372, + "language_loss": 0.8981787, + "learning_rate": 0.0009626813996273114, + "loss": 0.90909451, + "num_input_tokens_seen": 64813456, + "router_z_loss_mlp": 0.3347168, + "step": 780, + "time_per_iteration": 2.8651859760284424 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092654, + "balance_loss_mlp": 1.06018162, + "epoch": 0.15025009619084262, + "flos": 577633780224.0, + "grad_norm": 0.06737111741199381, + "language_loss": 0.89359641, + "learning_rate": 0.0009625632095220198, + "loss": 0.90452296, + "num_input_tokens_seen": 64896816, + "router_z_loss_mlp": 0.32470703, + "step": 781, + "time_per_iteration": 2.910163640975952 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093984, + "balance_loss_mlp": 1.06041455, + "epoch": 0.1504424778761062, + "flos": 483646311936.0, + "grad_norm": 0.06188715182302237, + "language_loss": 0.87568116, + "learning_rate": 0.0009624448398337637, + "loss": 0.88662094, + "num_input_tokens_seen": 64964176, + "router_z_loss_mlp": 0.3359375, + "step": 782, + "time_per_iteration": 2.532055616378784 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100397, + "balance_loss_mlp": 1.06751907, + "epoch": 0.15063485956136977, + "flos": 762167812608.0, + "grad_norm": 0.06229794960735175, + "language_loss": 0.89905757, + "learning_rate": 0.0009623262906084984, + "loss": 0.91006154, + "num_input_tokens_seen": 65042592, + "router_z_loss_mlp": 0.32861328, + "step": 783, + "time_per_iteration": 2.9851605892181396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104855, + "balance_loss_mlp": 1.0712378, + "epoch": 0.15082724124663333, + "flos": 497369687040.0, + "grad_norm": 0.060596744514248076, + "language_loss": 0.90796679, + "learning_rate": 0.0009622075618922486, + "loss": 0.91901541, + "num_input_tokens_seen": 65114576, + "router_z_loss_mlp": 0.33642578, + "step": 784, + "time_per_iteration": 2.6796786785125732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102349, + "balance_loss_mlp": 1.06928015, + "epoch": 0.15101962293189689, + "flos": 509476621824.0, + "grad_norm": 0.06389342174673626, + "language_loss": 0.87423813, + "learning_rate": 0.0009620886537311091, + "loss": 0.8852616, + "num_input_tokens_seen": 65186640, + "router_z_loss_mlp": 0.33081055, + "step": 785, + "time_per_iteration": 2.6153056621551514 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113163, + "balance_loss_mlp": 1.07685184, + "epoch": 0.15121200461716044, + "flos": 457520638464.0, + "grad_norm": 0.06793935281312648, + "language_loss": 0.85492945, + "learning_rate": 0.000961969566171244, + "loss": 0.86606109, + "num_input_tokens_seen": 65252112, + "router_z_loss_mlp": 0.36303711, + "step": 786, + "time_per_iteration": 2.506267786026001 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122272, + "balance_loss_mlp": 1.08703363, + "epoch": 0.151404386302424, + "flos": 537729477120.0, + "grad_norm": 0.0670602351843582, + "language_loss": 0.90370345, + "learning_rate": 0.0009618502992588873, + "loss": 0.91492617, + "num_input_tokens_seen": 65318912, + "router_z_loss_mlp": 0.35253906, + "step": 787, + "time_per_iteration": 2.623457670211792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01141844, + "balance_loss_mlp": 1.10658193, + "epoch": 0.15159676798768756, + "flos": 687858891264.0, + "grad_norm": 0.06543467559167064, + "language_loss": 0.88581872, + "learning_rate": 0.0009617308530403424, + "loss": 0.89723718, + "num_input_tokens_seen": 65395424, + "router_z_loss_mlp": 0.35302734, + "step": 788, + "time_per_iteration": 2.975861072540283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01149381, + "balance_loss_mlp": 1.11371326, + "epoch": 0.15178914967295112, + "flos": 545042193408.0, + "grad_norm": 0.059566397417978756, + "language_loss": 0.87806541, + "learning_rate": 0.0009616112275619825, + "loss": 0.88955921, + "num_input_tokens_seen": 65470480, + "router_z_loss_mlp": 0.35668945, + "step": 789, + "time_per_iteration": 2.683262348175049 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01152452, + "balance_loss_mlp": 1.1169517, + "epoch": 0.1519815313582147, + "flos": 511510671360.0, + "grad_norm": 0.05728483560240697, + "language_loss": 0.84466863, + "learning_rate": 0.0009614914228702503, + "loss": 0.85619313, + "num_input_tokens_seen": 65544720, + "router_z_loss_mlp": 0.35498047, + "step": 790, + "time_per_iteration": 2.6616339683532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01142719, + "balance_loss_mlp": 1.10850596, + "epoch": 0.15217391304347827, + "flos": 683747122176.0, + "grad_norm": 0.057799273493116435, + "language_loss": 0.89279461, + "learning_rate": 0.0009613714390116581, + "loss": 0.90422177, + "num_input_tokens_seen": 65627872, + "router_z_loss_mlp": 0.34204102, + "step": 791, + "time_per_iteration": 2.947608470916748 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133841, + "balance_loss_mlp": 1.0997231, + "epoch": 0.15236629472874183, + "flos": 643873342464.0, + "grad_norm": 0.06413295296627212, + "language_loss": 0.86589968, + "learning_rate": 0.0009612512760327879, + "loss": 0.87723809, + "num_input_tokens_seen": 65705264, + "router_z_loss_mlp": 0.34155273, + "step": 792, + "time_per_iteration": 2.8261189460754395 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124449, + "balance_loss_mlp": 1.08727932, + "epoch": 0.1525586764140054, + "flos": 412654791168.0, + "grad_norm": 0.06095846853214657, + "language_loss": 0.85749042, + "learning_rate": 0.0009611309339802909, + "loss": 0.86873484, + "num_input_tokens_seen": 65768592, + "router_z_loss_mlp": 0.37182617, + "step": 793, + "time_per_iteration": 2.438474178314209 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113041, + "balance_loss_mlp": 1.07811236, + "epoch": 0.15275105809926895, + "flos": 802444644864.0, + "grad_norm": 0.04691390558901254, + "language_loss": 0.84620011, + "learning_rate": 0.0009610104129008881, + "loss": 0.85733056, + "num_input_tokens_seen": 65852432, + "router_z_loss_mlp": 0.34985352, + "step": 794, + "time_per_iteration": 3.1149892807006836 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092099, + "balance_loss_mlp": 1.05786228, + "epoch": 0.1529434397845325, + "flos": 612143115264.0, + "grad_norm": 0.06446455819394356, + "language_loss": 0.88995111, + "learning_rate": 0.0009608897128413701, + "loss": 0.90087205, + "num_input_tokens_seen": 65927904, + "router_z_loss_mlp": 0.3425293, + "step": 795, + "time_per_iteration": 2.7310965061187744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085841, + "balance_loss_mlp": 1.05160367, + "epoch": 0.15313582146979607, + "flos": 614941009920.0, + "grad_norm": 0.04580320827636504, + "language_loss": 0.8595438, + "learning_rate": 0.0009607688338485965, + "loss": 0.87040222, + "num_input_tokens_seen": 66006800, + "router_z_loss_mlp": 0.3425293, + "step": 796, + "time_per_iteration": 2.8534584045410156 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088916, + "balance_loss_mlp": 1.05427384, + "epoch": 0.15332820315505963, + "flos": 793256440320.0, + "grad_norm": 0.053101967265095064, + "language_loss": 0.91128695, + "learning_rate": 0.0009606477759694969, + "loss": 0.92217612, + "num_input_tokens_seen": 66088608, + "router_z_loss_mlp": 0.34643555, + "step": 797, + "time_per_iteration": 3.0544466972351074 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098724, + "balance_loss_mlp": 1.06441545, + "epoch": 0.1535205848403232, + "flos": 549945510912.0, + "grad_norm": 0.0662794157411924, + "language_loss": 0.87591946, + "learning_rate": 0.0009605265392510703, + "loss": 0.88690674, + "num_input_tokens_seen": 66153616, + "router_z_loss_mlp": 0.34350586, + "step": 798, + "time_per_iteration": 2.6120660305023193 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011091, + "balance_loss_mlp": 1.07417202, + "epoch": 0.15371296652558677, + "flos": 535691045376.0, + "grad_norm": 0.07220239734969772, + "language_loss": 0.92342889, + "learning_rate": 0.0009604051237403846, + "loss": 0.93451989, + "num_input_tokens_seen": 66219472, + "router_z_loss_mlp": 0.34960938, + "step": 799, + "time_per_iteration": 2.640749216079712 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111728, + "balance_loss_mlp": 1.07808757, + "epoch": 0.15390534821085033, + "flos": 395002939392.0, + "grad_norm": 0.06314402273456009, + "language_loss": 0.86126584, + "learning_rate": 0.0009602835294845776, + "loss": 0.87238312, + "num_input_tokens_seen": 66281456, + "router_z_loss_mlp": 0.33666992, + "step": 800, + "time_per_iteration": 2.44914174079895 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117351, + "balance_loss_mlp": 1.08254242, + "epoch": 0.1540977298961139, + "flos": 535587738624.0, + "grad_norm": 0.057636094576239, + "language_loss": 0.91100746, + "learning_rate": 0.0009601617565308565, + "loss": 0.92218101, + "num_input_tokens_seen": 66348160, + "router_z_loss_mlp": 0.34790039, + "step": 801, + "time_per_iteration": 2.599679470062256 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119062, + "balance_loss_mlp": 1.08511138, + "epoch": 0.15429011158137745, + "flos": 723388147200.0, + "grad_norm": 0.05961266019354579, + "language_loss": 0.86783326, + "learning_rate": 0.0009600398049264977, + "loss": 0.87902391, + "num_input_tokens_seen": 66430576, + "router_z_loss_mlp": 0.33935547, + "step": 802, + "time_per_iteration": 2.9514007568359375 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121787, + "balance_loss_mlp": 1.08735943, + "epoch": 0.154482493266641, + "flos": 620209502208.0, + "grad_norm": 0.06366105456569557, + "language_loss": 0.92098475, + "learning_rate": 0.0009599176747188469, + "loss": 0.9322027, + "num_input_tokens_seen": 66506480, + "router_z_loss_mlp": 0.34448242, + "step": 803, + "time_per_iteration": 2.8068411350250244 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114024, + "balance_loss_mlp": 1.08012128, + "epoch": 0.15467487495190457, + "flos": 525351909888.0, + "grad_norm": 0.08101366702111423, + "language_loss": 0.83651662, + "learning_rate": 0.0009597953659553196, + "loss": 0.84765685, + "num_input_tokens_seen": 66577680, + "router_z_loss_mlp": 0.33911133, + "step": 804, + "time_per_iteration": 2.7075448036193848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098377, + "balance_loss_mlp": 1.06616712, + "epoch": 0.15486725663716813, + "flos": 527473299456.0, + "grad_norm": 0.07377431927286832, + "language_loss": 0.89624304, + "learning_rate": 0.0009596728786833997, + "loss": 0.90722686, + "num_input_tokens_seen": 66648496, + "router_z_loss_mlp": 0.32202148, + "step": 805, + "time_per_iteration": 2.6376051902770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088085, + "balance_loss_mlp": 1.05420554, + "epoch": 0.1550596383224317, + "flos": 1048118482944.0, + "grad_norm": 0.06708822771662253, + "language_loss": 0.90018022, + "learning_rate": 0.0009595502129506415, + "loss": 0.91106105, + "num_input_tokens_seen": 66735216, + "router_z_loss_mlp": 0.33911133, + "step": 806, + "time_per_iteration": 3.3391284942626953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092582, + "balance_loss_mlp": 1.05903625, + "epoch": 0.15525202000769528, + "flos": 613438050816.0, + "grad_norm": 0.06052700763637142, + "language_loss": 0.83084035, + "learning_rate": 0.0009594273688046678, + "loss": 0.84176612, + "num_input_tokens_seen": 66810672, + "router_z_loss_mlp": 0.33544922, + "step": 807, + "time_per_iteration": 2.7136006355285645 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088184, + "balance_loss_mlp": 1.05273128, + "epoch": 0.15544440169295884, + "flos": 532805810688.0, + "grad_norm": 0.07048562468234597, + "language_loss": 0.86048424, + "learning_rate": 0.000959304346293171, + "loss": 0.87136608, + "num_input_tokens_seen": 66879824, + "router_z_loss_mlp": 0.35473633, + "step": 808, + "time_per_iteration": 2.6744906902313232 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097573, + "balance_loss_mlp": 1.06254935, + "epoch": 0.1556367833782224, + "flos": 644433546240.0, + "grad_norm": 0.06803397985071584, + "language_loss": 0.88331544, + "learning_rate": 0.0009591811454639125, + "loss": 0.89429116, + "num_input_tokens_seen": 66949424, + "router_z_loss_mlp": 0.3503418, + "step": 809, + "time_per_iteration": 2.730431079864502 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.0610261, + "epoch": 0.15582916506348596, + "flos": 543540644352.0, + "grad_norm": 0.06204685505428811, + "language_loss": 0.88227659, + "learning_rate": 0.0009590577663647234, + "loss": 0.89322132, + "num_input_tokens_seen": 67024000, + "router_z_loss_mlp": 0.3347168, + "step": 810, + "time_per_iteration": 2.71469783782959 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104684, + "balance_loss_mlp": 1.07078123, + "epoch": 0.15602154674874952, + "flos": 579740613120.0, + "grad_norm": 0.05672341894910533, + "language_loss": 0.86610442, + "learning_rate": 0.0009589342090435036, + "loss": 0.87715125, + "num_input_tokens_seen": 67100672, + "router_z_loss_mlp": 0.33935547, + "step": 811, + "time_per_iteration": 2.799246072769165 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110387, + "balance_loss_mlp": 1.06918025, + "epoch": 0.15621392843401308, + "flos": 534982454784.0, + "grad_norm": 0.0647852675732537, + "language_loss": 0.87778354, + "learning_rate": 0.0009588104735482223, + "loss": 0.8888222, + "num_input_tokens_seen": 67171584, + "router_z_loss_mlp": 0.34692383, + "step": 812, + "time_per_iteration": 2.6684510707855225 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126921, + "balance_loss_mlp": 1.09106326, + "epoch": 0.15640631011927664, + "flos": 550635162624.0, + "grad_norm": 0.08222618986335321, + "language_loss": 0.84280443, + "learning_rate": 0.0009586865599269177, + "loss": 0.85407358, + "num_input_tokens_seen": 67240640, + "router_z_loss_mlp": 0.35864258, + "step": 813, + "time_per_iteration": 2.6293816566467285 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01131277, + "balance_loss_mlp": 1.09651566, + "epoch": 0.1565986918045402, + "flos": 637190641152.0, + "grad_norm": 0.05945515562529824, + "language_loss": 0.88725412, + "learning_rate": 0.0009585624682276977, + "loss": 0.89856684, + "num_input_tokens_seen": 67312976, + "router_z_loss_mlp": 0.34814453, + "step": 814, + "time_per_iteration": 2.744253158569336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137563, + "balance_loss_mlp": 1.10113239, + "epoch": 0.15679107348980378, + "flos": 490569122304.0, + "grad_norm": 0.09591637295165127, + "language_loss": 0.87945771, + "learning_rate": 0.0009584381984987386, + "loss": 0.89083332, + "num_input_tokens_seen": 67378528, + "router_z_loss_mlp": 0.36474609, + "step": 815, + "time_per_iteration": 2.5264036655426025 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124613, + "balance_loss_mlp": 1.0911386, + "epoch": 0.15698345517506734, + "flos": 529689231360.0, + "grad_norm": 0.05838460881618622, + "language_loss": 0.90277314, + "learning_rate": 0.0009583137507882864, + "loss": 0.91401929, + "num_input_tokens_seen": 67449728, + "router_z_loss_mlp": 0.3347168, + "step": 816, + "time_per_iteration": 2.6488330364227295 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109898, + "balance_loss_mlp": 1.07418323, + "epoch": 0.1571758368603309, + "flos": 545779897344.0, + "grad_norm": 0.07313796537718548, + "language_loss": 0.81262791, + "learning_rate": 0.000958189125144656, + "loss": 0.82372689, + "num_input_tokens_seen": 67520512, + "router_z_loss_mlp": 0.35766602, + "step": 817, + "time_per_iteration": 2.7040657997131348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101746, + "balance_loss_mlp": 1.06672239, + "epoch": 0.15736821854559446, + "flos": 565377048576.0, + "grad_norm": 0.067694528538076, + "language_loss": 0.88558215, + "learning_rate": 0.0009580643216162313, + "loss": 0.89659959, + "num_input_tokens_seen": 67592464, + "router_z_loss_mlp": 0.3503418, + "step": 818, + "time_per_iteration": 2.6538634300231934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096156, + "balance_loss_mlp": 1.06110835, + "epoch": 0.15756060023085802, + "flos": 500707436544.0, + "grad_norm": 0.05957146674366314, + "language_loss": 0.79884583, + "learning_rate": 0.0009579393402514652, + "loss": 0.80980736, + "num_input_tokens_seen": 67658928, + "router_z_loss_mlp": 0.35107422, + "step": 819, + "time_per_iteration": 2.5606625080108643 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082975, + "balance_loss_mlp": 1.048738, + "epoch": 0.15775298191612158, + "flos": 519014034432.0, + "grad_norm": 0.06194437160070725, + "language_loss": 0.91126758, + "learning_rate": 0.0009578141810988801, + "loss": 0.92209733, + "num_input_tokens_seen": 67727936, + "router_z_loss_mlp": 0.34228516, + "step": 820, + "time_per_iteration": 2.55538010597229 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082194, + "balance_loss_mlp": 1.04712272, + "epoch": 0.15794536360138514, + "flos": 465891153408.0, + "grad_norm": 0.060184436438788555, + "language_loss": 0.91010749, + "learning_rate": 0.0009576888442070668, + "loss": 0.92092943, + "num_input_tokens_seen": 67795488, + "router_z_loss_mlp": 0.35083008, + "step": 821, + "time_per_iteration": 2.6139276027679443 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094225, + "balance_loss_mlp": 1.05982161, + "epoch": 0.1581377452866487, + "flos": 516911583744.0, + "grad_norm": 0.06832586535724347, + "language_loss": 0.92820144, + "learning_rate": 0.0009575633296246854, + "loss": 0.93914366, + "num_input_tokens_seen": 67858896, + "router_z_loss_mlp": 0.34423828, + "step": 822, + "time_per_iteration": 2.557404041290283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096972, + "balance_loss_mlp": 1.06242526, + "epoch": 0.15833012697191226, + "flos": 549522109440.0, + "grad_norm": 0.06257557491721027, + "language_loss": 0.83520567, + "learning_rate": 0.0009574376374004652, + "loss": 0.84617537, + "num_input_tokens_seen": 67924864, + "router_z_loss_mlp": 0.34570312, + "step": 823, + "time_per_iteration": 2.673220157623291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100077, + "balance_loss_mlp": 1.06395626, + "epoch": 0.15852250865717585, + "flos": 487206641664.0, + "grad_norm": 0.07116075590187526, + "language_loss": 0.81073487, + "learning_rate": 0.000957311767583204, + "loss": 0.82173562, + "num_input_tokens_seen": 67992912, + "router_z_loss_mlp": 0.36132812, + "step": 824, + "time_per_iteration": 2.605074882507324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01159484, + "balance_loss_mlp": 1.14126849, + "epoch": 0.1587148903424394, + "flos": 1309041672192.0, + "grad_norm": 0.051809649393169656, + "language_loss": 0.8207159, + "learning_rate": 0.0009571857202217691, + "loss": 0.83231074, + "num_input_tokens_seen": 68207408, + "router_z_loss_mlp": 0.18261719, + "step": 825, + "time_per_iteration": 4.726073265075684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111019, + "balance_loss_mlp": 1.07349157, + "epoch": 0.15890727202770297, + "flos": 466634649600.0, + "grad_norm": 0.07947222616221912, + "language_loss": 0.92132723, + "learning_rate": 0.0009570594953650961, + "loss": 0.93243748, + "num_input_tokens_seen": 68270864, + "router_z_loss_mlp": 0.37524414, + "step": 826, + "time_per_iteration": 2.5146830081939697 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109067, + "balance_loss_mlp": 1.07208848, + "epoch": 0.15909965371296653, + "flos": 776733608448.0, + "grad_norm": 0.06013225990958685, + "language_loss": 0.80852252, + "learning_rate": 0.00095693309306219, + "loss": 0.81961316, + "num_input_tokens_seen": 68355408, + "router_z_loss_mlp": 0.36962891, + "step": 827, + "time_per_iteration": 3.095632553100586 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117102, + "balance_loss_mlp": 1.07945621, + "epoch": 0.1592920353982301, + "flos": 1077852538368.0, + "grad_norm": 0.05984978885312211, + "language_loss": 0.88600951, + "learning_rate": 0.0009568065133621244, + "loss": 0.89718056, + "num_input_tokens_seen": 68437072, + "router_z_loss_mlp": 0.37646484, + "step": 828, + "time_per_iteration": 3.3153574466705322 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111269, + "balance_loss_mlp": 1.07584, + "epoch": 0.15948441708349365, + "flos": 725307305472.0, + "grad_norm": 0.0632864692280333, + "language_loss": 0.85493571, + "learning_rate": 0.0009566797563140422, + "loss": 0.86604846, + "num_input_tokens_seen": 68511696, + "router_z_loss_mlp": 0.35449219, + "step": 829, + "time_per_iteration": 2.8705785274505615 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121543, + "balance_loss_mlp": 1.08470702, + "epoch": 0.1596767987687572, + "flos": 578447087616.0, + "grad_norm": 0.06433687205870958, + "language_loss": 0.88630873, + "learning_rate": 0.0009565528219671547, + "loss": 0.89752412, + "num_input_tokens_seen": 68587488, + "router_z_loss_mlp": 0.36816406, + "step": 830, + "time_per_iteration": 2.8890771865844727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01137333, + "balance_loss_mlp": 1.10049748, + "epoch": 0.15986918045402077, + "flos": 528728947200.0, + "grad_norm": 0.04994246668943954, + "language_loss": 0.85232639, + "learning_rate": 0.0009564257103707418, + "loss": 0.86369967, + "num_input_tokens_seen": 68655760, + "router_z_loss_mlp": 0.36816406, + "step": 831, + "time_per_iteration": 2.5870308876037598 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133852, + "balance_loss_mlp": 1.09632492, + "epoch": 0.16006156213928435, + "flos": 574313559552.0, + "grad_norm": 0.0648316290803925, + "language_loss": 0.91675746, + "learning_rate": 0.0009562984215741533, + "loss": 0.92809594, + "num_input_tokens_seen": 68724560, + "router_z_loss_mlp": 0.37524414, + "step": 832, + "time_per_iteration": 2.655066967010498 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117496, + "balance_loss_mlp": 1.08170903, + "epoch": 0.1602539438245479, + "flos": 515258675712.0, + "grad_norm": 0.14271195523272245, + "language_loss": 0.82911491, + "learning_rate": 0.0009561709556268065, + "loss": 0.84028995, + "num_input_tokens_seen": 68795440, + "router_z_loss_mlp": 0.35839844, + "step": 833, + "time_per_iteration": 2.69999098777771 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01119914, + "balance_loss_mlp": 1.08419931, + "epoch": 0.16044632550981147, + "flos": 620730418176.0, + "grad_norm": 0.05962773238435596, + "language_loss": 0.95060706, + "learning_rate": 0.0009560433125781884, + "loss": 0.96180618, + "num_input_tokens_seen": 68868176, + "router_z_loss_mlp": 0.35693359, + "step": 834, + "time_per_iteration": 2.711109161376953 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126421, + "balance_loss_mlp": 1.08977628, + "epoch": 0.16063870719507503, + "flos": 560817146880.0, + "grad_norm": 0.06388697234939344, + "language_loss": 0.92829657, + "learning_rate": 0.0009559154924778544, + "loss": 0.93956077, + "num_input_tokens_seen": 68939616, + "router_z_loss_mlp": 0.36621094, + "step": 835, + "time_per_iteration": 2.695260763168335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121916, + "balance_loss_mlp": 1.08789361, + "epoch": 0.1608310888803386, + "flos": 804778440192.0, + "grad_norm": 0.05750453212643973, + "language_loss": 0.85217482, + "learning_rate": 0.0009557874953754284, + "loss": 0.86339402, + "num_input_tokens_seen": 69016192, + "router_z_loss_mlp": 0.34057617, + "step": 836, + "time_per_iteration": 3.002013921737671 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01126166, + "balance_loss_mlp": 1.09204817, + "epoch": 0.16102347056560215, + "flos": 600311195136.0, + "grad_norm": 0.06332628409766573, + "language_loss": 0.84060842, + "learning_rate": 0.0009556593213206038, + "loss": 0.85187006, + "num_input_tokens_seen": 69089360, + "router_z_loss_mlp": 0.34130859, + "step": 837, + "time_per_iteration": 2.698716163635254 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125003, + "balance_loss_mlp": 1.09102869, + "epoch": 0.1612158522508657, + "flos": 553235208192.0, + "grad_norm": 0.07524747482874264, + "language_loss": 0.87844718, + "learning_rate": 0.0009555309703631414, + "loss": 0.88969719, + "num_input_tokens_seen": 69161952, + "router_z_loss_mlp": 0.33984375, + "step": 838, + "time_per_iteration": 2.669588327407837 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133813, + "balance_loss_mlp": 1.09752607, + "epoch": 0.16140823393612927, + "flos": 555701423616.0, + "grad_norm": 0.07144746672945328, + "language_loss": 0.87685311, + "learning_rate": 0.0009554024425528722, + "loss": 0.88819122, + "num_input_tokens_seen": 69232432, + "router_z_loss_mlp": 0.36279297, + "step": 839, + "time_per_iteration": 2.6809709072113037 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0112028, + "balance_loss_mlp": 1.08737814, + "epoch": 0.16160061562139286, + "flos": 543613427712.0, + "grad_norm": 0.06970106087394082, + "language_loss": 0.8929134, + "learning_rate": 0.0009552737379396948, + "loss": 0.90411627, + "num_input_tokens_seen": 69297696, + "router_z_loss_mlp": 0.32885742, + "step": 840, + "time_per_iteration": 2.6100995540618896 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102587, + "balance_loss_mlp": 1.06920815, + "epoch": 0.16179299730665642, + "flos": 603590717952.0, + "grad_norm": 0.06131687325166246, + "language_loss": 0.87945604, + "learning_rate": 0.0009551448565735767, + "loss": 0.89048195, + "num_input_tokens_seen": 69373888, + "router_z_loss_mlp": 0.33398438, + "step": 841, + "time_per_iteration": 2.7360360622406006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095736, + "balance_loss_mlp": 1.06168985, + "epoch": 0.16198537899191998, + "flos": 786821050368.0, + "grad_norm": 0.07162496841720159, + "language_loss": 0.8519845, + "learning_rate": 0.0009550157985045543, + "loss": 0.86294186, + "num_input_tokens_seen": 69449984, + "router_z_loss_mlp": 0.34082031, + "step": 842, + "time_per_iteration": 3.0436456203460693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087632, + "balance_loss_mlp": 1.05413389, + "epoch": 0.16217776067718354, + "flos": 519550917120.0, + "grad_norm": 0.060562390499230526, + "language_loss": 0.89622426, + "learning_rate": 0.0009548865637827321, + "loss": 0.90710062, + "num_input_tokens_seen": 69522736, + "router_z_loss_mlp": 0.33496094, + "step": 843, + "time_per_iteration": 2.6422221660614014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086448, + "balance_loss_mlp": 1.05342698, + "epoch": 0.1623701423624471, + "flos": 505015644672.0, + "grad_norm": 0.07097995853224412, + "language_loss": 0.90216166, + "learning_rate": 0.0009547571524582838, + "loss": 0.91302609, + "num_input_tokens_seen": 69587184, + "router_z_loss_mlp": 0.33032227, + "step": 844, + "time_per_iteration": 2.6082894802093506 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095057, + "balance_loss_mlp": 1.06031966, + "epoch": 0.16256252404771065, + "flos": 496940493312.0, + "grad_norm": 0.06932052947515681, + "language_loss": 0.92511153, + "learning_rate": 0.0009546275645814512, + "loss": 0.9360621, + "num_input_tokens_seen": 69656560, + "router_z_loss_mlp": 0.34765625, + "step": 845, + "time_per_iteration": 2.5985872745513916 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100013, + "balance_loss_mlp": 1.065418, + "epoch": 0.16275490573297421, + "flos": 502110061056.0, + "grad_norm": 0.07540183512891604, + "language_loss": 0.90294898, + "learning_rate": 0.0009544978002025446, + "loss": 0.91394913, + "num_input_tokens_seen": 69723872, + "router_z_loss_mlp": 0.34619141, + "step": 846, + "time_per_iteration": 2.5778391361236572 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096289, + "balance_loss_mlp": 1.06174231, + "epoch": 0.16294728741823777, + "flos": 506952179712.0, + "grad_norm": 0.06018935314915502, + "language_loss": 0.87532055, + "learning_rate": 0.0009543678593719434, + "loss": 0.8862834, + "num_input_tokens_seen": 69795504, + "router_z_loss_mlp": 0.34570312, + "step": 847, + "time_per_iteration": 2.697566270828247 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01102624, + "balance_loss_mlp": 1.06824434, + "epoch": 0.16313966910350133, + "flos": 509418395136.0, + "grad_norm": 0.054217985504269955, + "language_loss": 0.8754853, + "learning_rate": 0.0009542377421400945, + "loss": 0.88651162, + "num_input_tokens_seen": 69873408, + "router_z_loss_mlp": 0.34375, + "step": 848, + "time_per_iteration": 2.786766290664673 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104457, + "balance_loss_mlp": 1.06847942, + "epoch": 0.16333205078876492, + "flos": 543712352256.0, + "grad_norm": 0.06122856356214084, + "language_loss": 0.83524954, + "learning_rate": 0.0009541074485575145, + "loss": 0.84629411, + "num_input_tokens_seen": 69944112, + "router_z_loss_mlp": 0.35986328, + "step": 849, + "time_per_iteration": 2.713759183883667 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098701, + "balance_loss_mlp": 1.06346297, + "epoch": 0.16352443247402848, + "flos": 507477477888.0, + "grad_norm": 0.06331477383231503, + "language_loss": 0.92240757, + "learning_rate": 0.0009539769786747874, + "loss": 0.93339461, + "num_input_tokens_seen": 70012288, + "router_z_loss_mlp": 0.35253906, + "step": 850, + "time_per_iteration": 2.589945077896118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100894, + "balance_loss_mlp": 1.06584692, + "epoch": 0.16371681415929204, + "flos": 541851420672.0, + "grad_norm": 0.06704648725492578, + "language_loss": 0.81567919, + "learning_rate": 0.0009538463325425665, + "loss": 0.82668811, + "num_input_tokens_seen": 70086560, + "router_z_loss_mlp": 0.35083008, + "step": 851, + "time_per_iteration": 2.6779844760894775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105544, + "balance_loss_mlp": 1.07042515, + "epoch": 0.1639091958445556, + "flos": 520501026816.0, + "grad_norm": 0.058426853420895056, + "language_loss": 0.8673842, + "learning_rate": 0.0009537155102115728, + "loss": 0.87843966, + "num_input_tokens_seen": 70153968, + "router_z_loss_mlp": 0.35131836, + "step": 852, + "time_per_iteration": 2.5614206790924072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106136, + "balance_loss_mlp": 1.07175565, + "epoch": 0.16410157752981916, + "flos": 547149026304.0, + "grad_norm": 0.06460558975646845, + "language_loss": 0.83482397, + "learning_rate": 0.0009535845117325961, + "loss": 0.84588534, + "num_input_tokens_seen": 70222496, + "router_z_loss_mlp": 0.34423828, + "step": 853, + "time_per_iteration": 2.6453073024749756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098632, + "balance_loss_mlp": 1.06470513, + "epoch": 0.16429395921508272, + "flos": 582561828864.0, + "grad_norm": 0.052152281018199936, + "language_loss": 0.93584174, + "learning_rate": 0.0009534533371564946, + "loss": 0.94682807, + "num_input_tokens_seen": 70301680, + "router_z_loss_mlp": 0.33959961, + "step": 854, + "time_per_iteration": 2.75186824798584 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111206, + "balance_loss_mlp": 1.07670665, + "epoch": 0.16448634090034628, + "flos": 530678628864.0, + "grad_norm": 0.06475772966833339, + "language_loss": 0.8907218, + "learning_rate": 0.0009533219865341949, + "loss": 0.90183383, + "num_input_tokens_seen": 70371152, + "router_z_loss_mlp": 0.3449707, + "step": 855, + "time_per_iteration": 2.581479787826538 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108852, + "balance_loss_mlp": 1.07285094, + "epoch": 0.16467872258560984, + "flos": 491623948800.0, + "grad_norm": 0.06378602693040462, + "language_loss": 0.87287533, + "learning_rate": 0.0009531904599166916, + "loss": 0.88396388, + "num_input_tokens_seen": 70440832, + "router_z_loss_mlp": 0.36035156, + "step": 856, + "time_per_iteration": 2.6429831981658936 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107008, + "balance_loss_mlp": 1.07141232, + "epoch": 0.16487110427087343, + "flos": 506015216640.0, + "grad_norm": 0.07162133431974482, + "language_loss": 0.85139728, + "learning_rate": 0.0009530587573550478, + "loss": 0.86246729, + "num_input_tokens_seen": 70507424, + "router_z_loss_mlp": 0.35620117, + "step": 857, + "time_per_iteration": 2.5667338371276855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01071319, + "balance_loss_mlp": 1.05434394, + "epoch": 0.16506348595613698, + "flos": 1432006553088.0, + "grad_norm": 0.02717136097410494, + "language_loss": 0.74319386, + "learning_rate": 0.0009529268789003953, + "loss": 0.75390708, + "num_input_tokens_seen": 70742320, + "router_z_loss_mlp": 0.16992188, + "step": 858, + "time_per_iteration": 5.02930474281311 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107487, + "balance_loss_mlp": 1.0740366, + "epoch": 0.16525586764140054, + "flos": 476890827264.0, + "grad_norm": 0.06438670275364486, + "language_loss": 0.90481895, + "learning_rate": 0.0009527948246039337, + "loss": 0.91589379, + "num_input_tokens_seen": 70808400, + "router_z_loss_mlp": 0.33447266, + "step": 859, + "time_per_iteration": 2.5222055912017822 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109782, + "balance_loss_mlp": 1.07618856, + "epoch": 0.1654482493266641, + "flos": 880737297408.0, + "grad_norm": 0.058857893791213665, + "language_loss": 0.88361865, + "learning_rate": 0.000952662594516931, + "loss": 0.8947165, + "num_input_tokens_seen": 70886192, + "router_z_loss_mlp": 0.33618164, + "step": 860, + "time_per_iteration": 3.065053701400757 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109979, + "balance_loss_mlp": 1.07497942, + "epoch": 0.16564063101192766, + "flos": 626527028736.0, + "grad_norm": 0.058557043780191484, + "language_loss": 0.86803752, + "learning_rate": 0.0009525301886907234, + "loss": 0.87913728, + "num_input_tokens_seen": 70964816, + "router_z_loss_mlp": 0.34985352, + "step": 861, + "time_per_iteration": 2.873415470123291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121945, + "balance_loss_mlp": 1.08537149, + "epoch": 0.16583301269719122, + "flos": 561250722816.0, + "grad_norm": 0.0761086770239273, + "language_loss": 0.8825953, + "learning_rate": 0.0009523976071767155, + "loss": 0.8938148, + "num_input_tokens_seen": 71037456, + "router_z_loss_mlp": 0.36572266, + "step": 862, + "time_per_iteration": 2.71508526802063 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115561, + "balance_loss_mlp": 1.07994115, + "epoch": 0.16602539438245478, + "flos": 567510022656.0, + "grad_norm": 0.05388299317844869, + "language_loss": 0.88433009, + "learning_rate": 0.00095226485002638, + "loss": 0.8954857, + "num_input_tokens_seen": 71111872, + "router_z_loss_mlp": 0.35620117, + "step": 863, + "time_per_iteration": 2.7524497509002686 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111173, + "balance_loss_mlp": 1.07617354, + "epoch": 0.16621777606771834, + "flos": 574589984256.0, + "grad_norm": 0.05833582522103205, + "language_loss": 0.89311475, + "learning_rate": 0.0009521319172912576, + "loss": 0.90422642, + "num_input_tokens_seen": 71187808, + "router_z_loss_mlp": 0.35009766, + "step": 864, + "time_per_iteration": 2.717493772506714 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01134798, + "balance_loss_mlp": 1.09846306, + "epoch": 0.16641015775298193, + "flos": 514292599296.0, + "grad_norm": 0.05644176285984134, + "language_loss": 0.94990546, + "learning_rate": 0.0009519988090229579, + "loss": 0.96125346, + "num_input_tokens_seen": 71261728, + "router_z_loss_mlp": 0.36352539, + "step": 865, + "time_per_iteration": 2.6850624084472656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133116, + "balance_loss_mlp": 1.09668565, + "epoch": 0.1666025394382455, + "flos": 621395338752.0, + "grad_norm": 0.05816643645022503, + "language_loss": 0.88684535, + "learning_rate": 0.0009518655252731576, + "loss": 0.89817655, + "num_input_tokens_seen": 71338352, + "router_z_loss_mlp": 0.36450195, + "step": 866, + "time_per_iteration": 2.7240021228790283 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124082, + "balance_loss_mlp": 1.0882715, + "epoch": 0.16679492112350905, + "flos": 548528329728.0, + "grad_norm": 0.06128727898968579, + "language_loss": 0.9070124, + "learning_rate": 0.0009517320660936022, + "loss": 0.91825324, + "num_input_tokens_seen": 71416544, + "router_z_loss_mlp": 0.35839844, + "step": 867, + "time_per_iteration": 2.6959731578826904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118134, + "balance_loss_mlp": 1.08260965, + "epoch": 0.1669873028087726, + "flos": 665379477504.0, + "grad_norm": 0.05857722537468161, + "language_loss": 0.83557463, + "learning_rate": 0.0009515984315361051, + "loss": 0.84675598, + "num_input_tokens_seen": 71494080, + "router_z_loss_mlp": 0.35546875, + "step": 868, + "time_per_iteration": 2.813674211502075 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01122458, + "balance_loss_mlp": 1.08638549, + "epoch": 0.16717968449403617, + "flos": 538305647616.0, + "grad_norm": 0.06553445455365839, + "language_loss": 0.87103701, + "learning_rate": 0.000951464621652548, + "loss": 0.88226151, + "num_input_tokens_seen": 71562672, + "router_z_loss_mlp": 0.36083984, + "step": 869, + "time_per_iteration": 2.674333333969116 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111253, + "balance_loss_mlp": 1.07757819, + "epoch": 0.16737206617929973, + "flos": 529833235968.0, + "grad_norm": 0.059309523866322815, + "language_loss": 0.78951609, + "learning_rate": 0.0009513306364948804, + "loss": 0.80064136, + "num_input_tokens_seen": 71641904, + "router_z_loss_mlp": 0.34985352, + "step": 870, + "time_per_iteration": 2.7519431114196777 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.011143, + "balance_loss_mlp": 1.07953846, + "epoch": 0.1675644478645633, + "flos": 480529732608.0, + "grad_norm": 0.06711563999134491, + "language_loss": 0.89559376, + "learning_rate": 0.0009511964761151197, + "loss": 0.90673673, + "num_input_tokens_seen": 71709616, + "router_z_loss_mlp": 0.34814453, + "step": 871, + "time_per_iteration": 2.544520854949951 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113298, + "balance_loss_mlp": 1.07820272, + "epoch": 0.16775682954982685, + "flos": 494311334400.0, + "grad_norm": 0.06484202096701225, + "language_loss": 0.9050945, + "learning_rate": 0.0009510621405653521, + "loss": 0.91622752, + "num_input_tokens_seen": 71776592, + "router_z_loss_mlp": 0.35131836, + "step": 872, + "time_per_iteration": 2.594224452972412 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01106918, + "balance_loss_mlp": 1.07265687, + "epoch": 0.1679492112350904, + "flos": 751694846976.0, + "grad_norm": 0.060317450015561574, + "language_loss": 0.847211, + "learning_rate": 0.0009509276298977309, + "loss": 0.85828018, + "num_input_tokens_seen": 71856352, + "router_z_loss_mlp": 0.34277344, + "step": 873, + "time_per_iteration": 2.9428915977478027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110568, + "balance_loss_mlp": 1.07261181, + "epoch": 0.168141592920354, + "flos": 1135413075456.0, + "grad_norm": 0.05441785661992682, + "language_loss": 0.81867516, + "learning_rate": 0.0009507929441644778, + "loss": 0.82978088, + "num_input_tokens_seen": 71948480, + "router_z_loss_mlp": 0.37939453, + "step": 874, + "time_per_iteration": 3.52008318901062 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101336, + "balance_loss_mlp": 1.06640816, + "epoch": 0.16833397460561755, + "flos": 632114205696.0, + "grad_norm": 0.06557720885733571, + "language_loss": 0.86201179, + "learning_rate": 0.0009506580834178826, + "loss": 0.87302518, + "num_input_tokens_seen": 72019200, + "router_z_loss_mlp": 0.34936523, + "step": 875, + "time_per_iteration": 2.7744014263153076 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110472, + "balance_loss_mlp": 1.06817079, + "epoch": 0.1685263562908811, + "flos": 541171943424.0, + "grad_norm": 0.06007828909359903, + "language_loss": 0.91612709, + "learning_rate": 0.0009505230477103028, + "loss": 0.92717427, + "num_input_tokens_seen": 72088672, + "router_z_loss_mlp": 0.36547852, + "step": 876, + "time_per_iteration": 2.6593635082244873 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103279, + "balance_loss_mlp": 1.06703997, + "epoch": 0.16871873797614467, + "flos": 619036812288.0, + "grad_norm": 0.08702038824672748, + "language_loss": 0.81312418, + "learning_rate": 0.0009503878370941641, + "loss": 0.824157, + "num_input_tokens_seen": 72159952, + "router_z_loss_mlp": 0.36206055, + "step": 877, + "time_per_iteration": 2.7511024475097656 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094199, + "balance_loss_mlp": 1.05986643, + "epoch": 0.16891111966140823, + "flos": 606067107840.0, + "grad_norm": 0.06953183101172467, + "language_loss": 0.88841844, + "learning_rate": 0.0009502524516219595, + "loss": 0.89936042, + "num_input_tokens_seen": 72231648, + "router_z_loss_mlp": 0.34375, + "step": 878, + "time_per_iteration": 2.697455406188965 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091575, + "balance_loss_mlp": 1.05757689, + "epoch": 0.1691035013466718, + "flos": 552058136064.0, + "grad_norm": 0.0721678347454753, + "language_loss": 0.89980447, + "learning_rate": 0.0009501168913462506, + "loss": 0.91072023, + "num_input_tokens_seen": 72298608, + "router_z_loss_mlp": 0.34008789, + "step": 879, + "time_per_iteration": 2.6825287342071533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080465, + "balance_loss_mlp": 1.06263125, + "epoch": 0.16929588303193535, + "flos": 1475544121344.0, + "grad_norm": 0.044515803528062385, + "language_loss": 0.79121923, + "learning_rate": 0.0009499811563196665, + "loss": 0.80202389, + "num_input_tokens_seen": 72525312, + "router_z_loss_mlp": 0.17871094, + "step": 880, + "time_per_iteration": 4.825777769088745 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081319, + "balance_loss_mlp": 1.0464623, + "epoch": 0.1694882647171989, + "flos": 925850456064.0, + "grad_norm": 0.06491790696384477, + "language_loss": 0.85360616, + "learning_rate": 0.0009498452465949042, + "loss": 0.86441934, + "num_input_tokens_seen": 72612976, + "router_z_loss_mlp": 0.34887695, + "step": 881, + "time_per_iteration": 3.2700376510620117 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086319, + "balance_loss_mlp": 1.05227244, + "epoch": 0.1696806464024625, + "flos": 545829359616.0, + "grad_norm": 0.057533624801199786, + "language_loss": 0.916857, + "learning_rate": 0.0009497091622247285, + "loss": 0.92772019, + "num_input_tokens_seen": 72686800, + "router_z_loss_mlp": 0.34082031, + "step": 882, + "time_per_iteration": 2.711721181869507 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085559, + "balance_loss_mlp": 1.05184698, + "epoch": 0.16987302808772606, + "flos": 528970466304.0, + "grad_norm": 0.08384615451013337, + "language_loss": 0.93744707, + "learning_rate": 0.0009495729032619723, + "loss": 0.94830269, + "num_input_tokens_seen": 72759360, + "router_z_loss_mlp": 0.33740234, + "step": 883, + "time_per_iteration": 2.688525438308716 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084621, + "balance_loss_mlp": 1.05062199, + "epoch": 0.17006540977298962, + "flos": 754855096320.0, + "grad_norm": 0.06073677328113264, + "language_loss": 0.84419179, + "learning_rate": 0.0009494364697595354, + "loss": 0.85503805, + "num_input_tokens_seen": 72831424, + "router_z_loss_mlp": 0.34033203, + "step": 884, + "time_per_iteration": 2.9112000465393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092434, + "balance_loss_mlp": 1.05750597, + "epoch": 0.17025779145825318, + "flos": 558532813824.0, + "grad_norm": 0.06728326387015754, + "language_loss": 0.89818925, + "learning_rate": 0.0009492998617703867, + "loss": 0.90911365, + "num_input_tokens_seen": 72901536, + "router_z_loss_mlp": 0.34936523, + "step": 885, + "time_per_iteration": 2.6760926246643066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093981, + "balance_loss_mlp": 1.06045985, + "epoch": 0.17045017314351674, + "flos": 511963186176.0, + "grad_norm": 0.0687386743468794, + "language_loss": 0.87971282, + "learning_rate": 0.0009491630793475619, + "loss": 0.89065266, + "num_input_tokens_seen": 72970480, + "router_z_loss_mlp": 0.33520508, + "step": 886, + "time_per_iteration": 2.59726619720459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094476, + "balance_loss_mlp": 1.06011951, + "epoch": 0.1706425548287803, + "flos": 508674898944.0, + "grad_norm": 0.058204707286146434, + "language_loss": 0.85722017, + "learning_rate": 0.0009490261225441643, + "loss": 0.8681649, + "num_input_tokens_seen": 73053376, + "router_z_loss_mlp": 0.34350586, + "step": 887, + "time_per_iteration": 2.900501012802124 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087689, + "balance_loss_mlp": 1.0545013, + "epoch": 0.17083493651404386, + "flos": 717016776192.0, + "grad_norm": 0.05310353290702558, + "language_loss": 0.90775931, + "learning_rate": 0.0009488889914133656, + "loss": 0.9186362, + "num_input_tokens_seen": 73136032, + "router_z_loss_mlp": 0.33203125, + "step": 888, + "time_per_iteration": 2.992532968521118 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089684, + "balance_loss_mlp": 1.05520868, + "epoch": 0.17102731819930742, + "flos": 558852908544.0, + "grad_norm": 0.047287767355612194, + "language_loss": 0.88680297, + "learning_rate": 0.0009487516860084047, + "loss": 0.89769983, + "num_input_tokens_seen": 73208544, + "router_z_loss_mlp": 0.34472656, + "step": 889, + "time_per_iteration": 2.7029643058776855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082798, + "balance_loss_mlp": 1.04858518, + "epoch": 0.17121969988457098, + "flos": 494542679040.0, + "grad_norm": 0.0765590367769256, + "language_loss": 0.88680983, + "learning_rate": 0.0009486142063825884, + "loss": 0.89763772, + "num_input_tokens_seen": 73274336, + "router_z_loss_mlp": 0.34228516, + "step": 890, + "time_per_iteration": 2.5640931129455566 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01038716, + "balance_loss_mlp": 1.02402985, + "epoch": 0.17141208156983456, + "flos": 1548088063488.0, + "grad_norm": 0.02832238153451814, + "language_loss": 0.72426212, + "learning_rate": 0.0009484765525892909, + "loss": 0.7346493, + "num_input_tokens_seen": 73506320, + "router_z_loss_mlp": 0.14648438, + "step": 891, + "time_per_iteration": 4.979609251022339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108867, + "balance_loss_mlp": 1.0540278, + "epoch": 0.17160446325509812, + "flos": 619282713600.0, + "grad_norm": 0.06449268303648867, + "language_loss": 0.90758598, + "learning_rate": 0.0009483387246819542, + "loss": 0.91847265, + "num_input_tokens_seen": 73578048, + "router_z_loss_mlp": 0.34667969, + "step": 892, + "time_per_iteration": 2.731332540512085 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_mlp": 1.01767898, + "epoch": 0.17179684494036168, + "flos": 1381026972672.0, + "grad_norm": 0.016720826063608682, + "language_loss": 0.82285583, + "learning_rate": 0.0009482007227140877, + "loss": 0.83318138, + "num_input_tokens_seen": 73798640, + "router_z_loss_mlp": 0.1484375, + "step": 893, + "time_per_iteration": 4.641844987869263 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097484, + "balance_loss_mlp": 1.06386662, + "epoch": 0.17198922662562524, + "flos": 492386383872.0, + "grad_norm": 0.05711411270468228, + "language_loss": 0.89587665, + "learning_rate": 0.0009480625467392688, + "loss": 0.90685147, + "num_input_tokens_seen": 73867328, + "router_z_loss_mlp": 0.33642578, + "step": 894, + "time_per_iteration": 2.6310250759124756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_mlp": 1.01795936, + "epoch": 0.1721816083108888, + "flos": 1457529914880.0, + "grad_norm": 0.013728573618451478, + "language_loss": 0.77994668, + "learning_rate": 0.0009479241968111421, + "loss": 0.79027796, + "num_input_tokens_seen": 74093376, + "router_z_loss_mlp": 0.15136719, + "step": 895, + "time_per_iteration": 4.7525646686553955 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132534, + "balance_loss_mlp": 1.09834456, + "epoch": 0.17237398999615236, + "flos": 527853030912.0, + "grad_norm": 0.05821127752563967, + "language_loss": 0.87793648, + "learning_rate": 0.0009477856729834196, + "loss": 0.88926184, + "num_input_tokens_seen": 74169136, + "router_z_loss_mlp": 0.34228516, + "step": 896, + "time_per_iteration": 2.7015438079833984 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01132108, + "balance_loss_mlp": 1.09901524, + "epoch": 0.17256637168141592, + "flos": 603644562432.0, + "grad_norm": 0.08337200045302615, + "language_loss": 0.9056648, + "learning_rate": 0.0009476469753098809, + "loss": 0.91698587, + "num_input_tokens_seen": 74236912, + "router_z_loss_mlp": 0.33105469, + "step": 897, + "time_per_iteration": 2.7035813331604004 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01125108, + "balance_loss_mlp": 1.08922589, + "epoch": 0.17275875336667948, + "flos": 509437334016.0, + "grad_norm": 0.05742024530278536, + "language_loss": 0.874506, + "learning_rate": 0.0009475081038443738, + "loss": 0.88575709, + "num_input_tokens_seen": 74305968, + "router_z_loss_mlp": 0.35913086, + "step": 898, + "time_per_iteration": 2.584437370300293 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115669, + "balance_loss_mlp": 1.07971573, + "epoch": 0.17295113505194307, + "flos": 664951693824.0, + "grad_norm": 0.06535241228499304, + "language_loss": 0.85809892, + "learning_rate": 0.0009473690586408124, + "loss": 0.86925566, + "num_input_tokens_seen": 74384144, + "router_z_loss_mlp": 0.35986328, + "step": 899, + "time_per_iteration": 2.83156418800354 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01116393, + "balance_loss_mlp": 1.08084452, + "epoch": 0.17314351673720663, + "flos": 555125253120.0, + "grad_norm": 0.0683413775827569, + "language_loss": 0.86639923, + "learning_rate": 0.0009472298397531792, + "loss": 0.87756318, + "num_input_tokens_seen": 74455040, + "router_z_loss_mlp": 0.35546875, + "step": 900, + "time_per_iteration": 2.6944193840026855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01123337, + "balance_loss_mlp": 1.08635855, + "epoch": 0.17333589842247019, + "flos": 503361326592.0, + "grad_norm": 0.09670394547256775, + "language_loss": 0.87118709, + "learning_rate": 0.0009470904472355235, + "loss": 0.88242042, + "num_input_tokens_seen": 74525248, + "router_z_loss_mlp": 0.36987305, + "step": 901, + "time_per_iteration": 2.637882709503174 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114178, + "balance_loss_mlp": 1.07982159, + "epoch": 0.17352828010773375, + "flos": 555924003840.0, + "grad_norm": 0.06358596699153923, + "language_loss": 0.79912066, + "learning_rate": 0.0009469508811419626, + "loss": 0.81026244, + "num_input_tokens_seen": 74597328, + "router_z_loss_mlp": 0.34399414, + "step": 902, + "time_per_iteration": 2.726072311401367 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077408, + "balance_loss_mlp": 1.06453359, + "epoch": 0.1737206617929973, + "flos": 1553711556096.0, + "grad_norm": 0.030950293127884103, + "language_loss": 0.7161383, + "learning_rate": 0.0009468111415266806, + "loss": 0.72691238, + "num_input_tokens_seen": 74819664, + "router_z_loss_mlp": 0.12890625, + "step": 903, + "time_per_iteration": 4.791790723800659 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109546, + "balance_loss_mlp": 1.07445073, + "epoch": 0.17391304347826086, + "flos": 516390667776.0, + "grad_norm": 0.06883251868009001, + "language_loss": 0.84220147, + "learning_rate": 0.0009466712284439292, + "loss": 0.85329694, + "num_input_tokens_seen": 74896224, + "router_z_loss_mlp": 0.35131836, + "step": 904, + "time_per_iteration": 2.7648017406463623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104504, + "balance_loss_mlp": 1.06995738, + "epoch": 0.17410542516352442, + "flos": 540773273088.0, + "grad_norm": 0.06988851938988141, + "language_loss": 0.8903957, + "learning_rate": 0.0009465311419480276, + "loss": 0.90144074, + "num_input_tokens_seen": 74966560, + "router_z_loss_mlp": 0.34545898, + "step": 905, + "time_per_iteration": 2.725829601287842 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098399, + "balance_loss_mlp": 1.0637325, + "epoch": 0.17429780684878798, + "flos": 623542869504.0, + "grad_norm": 0.06312030659776342, + "language_loss": 0.88624233, + "learning_rate": 0.0009463908820933622, + "loss": 0.89722633, + "num_input_tokens_seen": 75045248, + "router_z_loss_mlp": 0.34692383, + "step": 906, + "time_per_iteration": 2.8389482498168945 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093859, + "balance_loss_mlp": 1.05900264, + "epoch": 0.17449018853405157, + "flos": 575368386048.0, + "grad_norm": 0.056066721215551084, + "language_loss": 0.83138871, + "learning_rate": 0.0009462504489343868, + "loss": 0.84232736, + "num_input_tokens_seen": 75123952, + "router_z_loss_mlp": 0.34863281, + "step": 907, + "time_per_iteration": 2.863349437713623 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086469, + "balance_loss_mlp": 1.05199337, + "epoch": 0.17468257021931513, + "flos": 533499844608.0, + "grad_norm": 0.07604190500703253, + "language_loss": 0.894853, + "learning_rate": 0.0009461098425256222, + "loss": 0.90571761, + "num_input_tokens_seen": 75191728, + "router_z_loss_mlp": 0.3449707, + "step": 908, + "time_per_iteration": 2.5941011905670166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108834, + "balance_loss_mlp": 1.05345941, + "epoch": 0.1748749519045787, + "flos": 540496848384.0, + "grad_norm": 0.050694136543679796, + "language_loss": 0.85873353, + "learning_rate": 0.0009459690629216567, + "loss": 0.86961693, + "num_input_tokens_seen": 75262224, + "router_z_loss_mlp": 0.34887695, + "step": 909, + "time_per_iteration": 2.6097571849823 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109032, + "balance_loss_mlp": 1.0570612, + "epoch": 0.17506733358984225, + "flos": 498373641216.0, + "grad_norm": 0.0569262349138849, + "language_loss": 0.88138729, + "learning_rate": 0.0009458281101771457, + "loss": 0.89229047, + "num_input_tokens_seen": 75329760, + "router_z_loss_mlp": 0.33276367, + "step": 910, + "time_per_iteration": 2.5904784202575684 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091588, + "balance_loss_mlp": 1.05744696, + "epoch": 0.1752597152751058, + "flos": 622621873152.0, + "grad_norm": 0.06350455217589325, + "language_loss": 0.8266046, + "learning_rate": 0.0009456869843468122, + "loss": 0.83752048, + "num_input_tokens_seen": 75407920, + "router_z_loss_mlp": 0.34179688, + "step": 911, + "time_per_iteration": 2.8930556774139404 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090023, + "balance_loss_mlp": 1.05476046, + "epoch": 0.17545209696036937, + "flos": 520717814784.0, + "grad_norm": 0.07844481886152296, + "language_loss": 0.79097009, + "learning_rate": 0.0009455456854854459, + "loss": 0.80187035, + "num_input_tokens_seen": 75476752, + "router_z_loss_mlp": 0.35302734, + "step": 912, + "time_per_iteration": 2.5984511375427246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096028, + "balance_loss_mlp": 1.0631026, + "epoch": 0.17564447864563293, + "flos": 461750270976.0, + "grad_norm": 0.05516798292623818, + "language_loss": 0.84505737, + "learning_rate": 0.0009454042136479039, + "loss": 0.85601771, + "num_input_tokens_seen": 75542944, + "router_z_loss_mlp": 0.3293457, + "step": 913, + "time_per_iteration": 2.586195945739746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085981, + "balance_loss_mlp": 1.05286503, + "epoch": 0.1758368603308965, + "flos": 480416251392.0, + "grad_norm": 0.05301404729603274, + "language_loss": 0.83308446, + "learning_rate": 0.0009452625688891103, + "loss": 0.84394431, + "num_input_tokens_seen": 75609840, + "router_z_loss_mlp": 0.33129883, + "step": 914, + "time_per_iteration": 2.5374929904937744 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01052517, + "balance_loss_mlp": 1.038975, + "epoch": 0.17602924201616005, + "flos": 1478160133632.0, + "grad_norm": 0.03507986977902886, + "language_loss": 0.78734738, + "learning_rate": 0.0009451207512640567, + "loss": 0.79787254, + "num_input_tokens_seen": 75819312, + "router_z_loss_mlp": 0.13574219, + "step": 915, + "time_per_iteration": 4.5561583042144775 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097389, + "balance_loss_mlp": 1.06226993, + "epoch": 0.17622162370142364, + "flos": 602010593280.0, + "grad_norm": 0.06815502965849334, + "language_loss": 0.93451321, + "learning_rate": 0.0009449787608278015, + "loss": 0.94548714, + "num_input_tokens_seen": 75893984, + "router_z_loss_mlp": 0.35131836, + "step": 916, + "time_per_iteration": 2.7807908058166504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092918, + "balance_loss_mlp": 1.0588007, + "epoch": 0.1764140053866872, + "flos": 442473214464.0, + "grad_norm": 0.0637680644109211, + "language_loss": 0.92700857, + "learning_rate": 0.0009448365976354704, + "loss": 0.93793774, + "num_input_tokens_seen": 75958944, + "router_z_loss_mlp": 0.34130859, + "step": 917, + "time_per_iteration": 2.4800124168395996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105643, + "balance_loss_mlp": 1.06909323, + "epoch": 0.17660638707195075, + "flos": 500362610688.0, + "grad_norm": 0.07080486598597346, + "language_loss": 0.90158784, + "learning_rate": 0.0009446942617422558, + "loss": 0.91264427, + "num_input_tokens_seen": 76024240, + "router_z_loss_mlp": 0.36547852, + "step": 918, + "time_per_iteration": 2.5415430068969727 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101469, + "balance_loss_mlp": 1.06766129, + "epoch": 0.17679876875721431, + "flos": 538621360128.0, + "grad_norm": 0.060000223973742446, + "language_loss": 0.86201262, + "learning_rate": 0.0009445517532034176, + "loss": 0.87302732, + "num_input_tokens_seen": 76095264, + "router_z_loss_mlp": 0.33789062, + "step": 919, + "time_per_iteration": 2.6849868297576904 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01121669, + "balance_loss_mlp": 1.08569145, + "epoch": 0.17699115044247787, + "flos": 497477376000.0, + "grad_norm": 0.08221632690768264, + "language_loss": 0.89522099, + "learning_rate": 0.0009444090720742824, + "loss": 0.9064377, + "num_input_tokens_seen": 76163520, + "router_z_loss_mlp": 0.35986328, + "step": 920, + "time_per_iteration": 2.6034600734710693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113572, + "balance_loss_mlp": 1.07883418, + "epoch": 0.17718353212774143, + "flos": 662444780544.0, + "grad_norm": 0.08029288241638204, + "language_loss": 0.88040781, + "learning_rate": 0.0009442662184102439, + "loss": 0.89154357, + "num_input_tokens_seen": 76233760, + "router_z_loss_mlp": 0.34741211, + "step": 921, + "time_per_iteration": 2.767352342605591 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105947, + "balance_loss_mlp": 1.07309294, + "epoch": 0.177375913813005, + "flos": 582340658688.0, + "grad_norm": 0.0705507668945597, + "language_loss": 0.87951338, + "learning_rate": 0.000944123192266763, + "loss": 0.89057279, + "num_input_tokens_seen": 76310704, + "router_z_loss_mlp": 0.32836914, + "step": 922, + "time_per_iteration": 2.789315700531006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108289, + "balance_loss_mlp": 1.0727644, + "epoch": 0.17756829549826855, + "flos": 552285098496.0, + "grad_norm": 0.06115562628552814, + "language_loss": 0.83835006, + "learning_rate": 0.0009439799936993671, + "loss": 0.84943295, + "num_input_tokens_seen": 76386992, + "router_z_loss_mlp": 0.35546875, + "step": 923, + "time_per_iteration": 2.7160987854003906 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090669, + "balance_loss_mlp": 1.05733824, + "epoch": 0.17776067718353214, + "flos": 556060806144.0, + "grad_norm": 0.07059184324253498, + "language_loss": 0.88508618, + "learning_rate": 0.0009438366227636511, + "loss": 0.89599288, + "num_input_tokens_seen": 76453328, + "router_z_loss_mlp": 0.33349609, + "step": 924, + "time_per_iteration": 2.6319191455841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084897, + "balance_loss_mlp": 1.05163789, + "epoch": 0.1779530588687957, + "flos": 658161303552.0, + "grad_norm": 0.06263940487075517, + "language_loss": 0.86677843, + "learning_rate": 0.0009436930795152763, + "loss": 0.87762737, + "num_input_tokens_seen": 76529040, + "router_z_loss_mlp": 0.33276367, + "step": 925, + "time_per_iteration": 2.8063783645629883 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084326, + "balance_loss_mlp": 1.05159163, + "epoch": 0.17814544055405926, + "flos": 644187644928.0, + "grad_norm": 0.06448697412821461, + "language_loss": 0.8710525, + "learning_rate": 0.0009435493640099713, + "loss": 0.88189578, + "num_input_tokens_seen": 76604080, + "router_z_loss_mlp": 0.32739258, + "step": 926, + "time_per_iteration": 2.7599081993103027 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01080787, + "balance_loss_mlp": 1.04664516, + "epoch": 0.17833782223932282, + "flos": 460672123392.0, + "grad_norm": 0.06497730431564504, + "language_loss": 0.84328961, + "learning_rate": 0.0009434054763035314, + "loss": 0.85409749, + "num_input_tokens_seen": 76674096, + "router_z_loss_mlp": 0.34155273, + "step": 927, + "time_per_iteration": 2.612910032272339 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081381, + "balance_loss_mlp": 1.04740596, + "epoch": 0.17853020392458638, + "flos": 759212766720.0, + "grad_norm": 0.04594292129212818, + "language_loss": 0.85898727, + "learning_rate": 0.0009432614164518185, + "loss": 0.8698011, + "num_input_tokens_seen": 76752144, + "router_z_loss_mlp": 0.33984375, + "step": 928, + "time_per_iteration": 2.926981210708618 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086846, + "balance_loss_mlp": 1.05153632, + "epoch": 0.17872258560984994, + "flos": 782320785408.0, + "grad_norm": 0.055185850673896385, + "language_loss": 0.84792197, + "learning_rate": 0.000943117184510762, + "loss": 0.85879046, + "num_input_tokens_seen": 76830240, + "router_z_loss_mlp": 0.35327148, + "step": 929, + "time_per_iteration": 2.995514154434204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01039861, + "balance_loss_mlp": 1.02660513, + "epoch": 0.1789149672951135, + "flos": 1459095482880.0, + "grad_norm": 0.021362691821678215, + "language_loss": 0.78789961, + "learning_rate": 0.0009429727805363575, + "loss": 0.79829824, + "num_input_tokens_seen": 77062464, + "router_z_loss_mlp": 0.1328125, + "step": 930, + "time_per_iteration": 4.99839448928833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091678, + "balance_loss_mlp": 1.05739331, + "epoch": 0.17910734898037706, + "flos": 503598463488.0, + "grad_norm": 0.05761618473313655, + "language_loss": 0.88773429, + "learning_rate": 0.0009428282045846674, + "loss": 0.89865112, + "num_input_tokens_seen": 77136672, + "router_z_loss_mlp": 0.34301758, + "step": 931, + "time_per_iteration": 2.6966652870178223 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.05452061, + "epoch": 0.17929973066564064, + "flos": 745895264256.0, + "grad_norm": 0.05798282919409206, + "language_loss": 0.89983928, + "learning_rate": 0.0009426834567118214, + "loss": 0.91071755, + "num_input_tokens_seen": 77227040, + "router_z_loss_mlp": 0.33300781, + "step": 932, + "time_per_iteration": 3.072160482406616 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092765, + "balance_loss_mlp": 1.05907631, + "epoch": 0.1794921123509042, + "flos": 712875893760.0, + "grad_norm": 0.055390897890994044, + "language_loss": 0.80879378, + "learning_rate": 0.0009425385369740155, + "loss": 0.81972146, + "num_input_tokens_seen": 77319392, + "router_z_loss_mlp": 0.3371582, + "step": 933, + "time_per_iteration": 3.0337042808532715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092731, + "balance_loss_mlp": 1.05825567, + "epoch": 0.17968449403616776, + "flos": 632838763008.0, + "grad_norm": 0.0687685702394307, + "language_loss": 0.87443584, + "learning_rate": 0.0009423934454275125, + "loss": 0.8853631, + "num_input_tokens_seen": 77394688, + "router_z_loss_mlp": 0.3449707, + "step": 934, + "time_per_iteration": 2.7970879077911377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089195, + "balance_loss_mlp": 1.05526757, + "epoch": 0.17987687572143132, + "flos": 536060602368.0, + "grad_norm": 0.08214865293258214, + "language_loss": 0.92215371, + "learning_rate": 0.0009422481821286418, + "loss": 0.93304563, + "num_input_tokens_seen": 77468288, + "router_z_loss_mlp": 0.33959961, + "step": 935, + "time_per_iteration": 2.7134642601013184 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091708, + "balance_loss_mlp": 1.05914021, + "epoch": 0.18006925740669488, + "flos": 537818227200.0, + "grad_norm": 0.0718764173736199, + "language_loss": 0.87967253, + "learning_rate": 0.0009421027471337998, + "loss": 0.89058959, + "num_input_tokens_seen": 77535840, + "router_z_loss_mlp": 0.32568359, + "step": 936, + "time_per_iteration": 2.608764171600342 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098474, + "balance_loss_mlp": 1.06333113, + "epoch": 0.18026163909195844, + "flos": 539255757312.0, + "grad_norm": 0.06697051800305152, + "language_loss": 0.82882118, + "learning_rate": 0.0009419571404994493, + "loss": 0.83980596, + "num_input_tokens_seen": 77604000, + "router_z_loss_mlp": 0.3515625, + "step": 937, + "time_per_iteration": 2.620296001434326 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096306, + "balance_loss_mlp": 1.06240284, + "epoch": 0.180454020777222, + "flos": 500382959616.0, + "grad_norm": 0.08555714620461663, + "language_loss": 0.90948844, + "learning_rate": 0.00094181136228212, + "loss": 0.92045152, + "num_input_tokens_seen": 77671488, + "router_z_loss_mlp": 0.33935547, + "step": 938, + "time_per_iteration": 2.62837290763855 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109732, + "balance_loss_mlp": 1.06415629, + "epoch": 0.18064640246248556, + "flos": 498689353728.0, + "grad_norm": 0.06983123921060745, + "language_loss": 0.86323059, + "learning_rate": 0.0009416654125384077, + "loss": 0.8742038, + "num_input_tokens_seen": 77746240, + "router_z_loss_mlp": 0.33154297, + "step": 939, + "time_per_iteration": 2.715686321258545 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01054723, + "balance_loss_mlp": 1.04242051, + "epoch": 0.18083878414774912, + "flos": 1518572358144.0, + "grad_norm": 0.027679884047562747, + "language_loss": 0.79772377, + "learning_rate": 0.0009415192913249752, + "loss": 0.80827093, + "num_input_tokens_seen": 77966080, + "router_z_loss_mlp": 0.12304688, + "step": 940, + "time_per_iteration": 4.941875219345093 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090642, + "balance_loss_mlp": 1.05728722, + "epoch": 0.1810311658330127, + "flos": 727006703616.0, + "grad_norm": 0.07011009980003599, + "language_loss": 0.84326053, + "learning_rate": 0.000941372998698552, + "loss": 0.85416698, + "num_input_tokens_seen": 78049200, + "router_z_loss_mlp": 0.33374023, + "step": 941, + "time_per_iteration": 2.931520938873291 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094264, + "balance_loss_mlp": 1.0597409, + "epoch": 0.18122354751827627, + "flos": 564643726848.0, + "grad_norm": 0.08254502738164117, + "language_loss": 0.8207435, + "learning_rate": 0.0009412265347159336, + "loss": 0.8316862, + "num_input_tokens_seen": 78122752, + "router_z_loss_mlp": 0.34570312, + "step": 942, + "time_per_iteration": 2.696354627609253 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091238, + "balance_loss_mlp": 1.05869377, + "epoch": 0.18141592920353983, + "flos": 519024208896.0, + "grad_norm": 0.05729066672306875, + "language_loss": 0.85217965, + "learning_rate": 0.0009410798994339829, + "loss": 0.86309201, + "num_input_tokens_seen": 78194064, + "router_z_loss_mlp": 0.32543945, + "step": 943, + "time_per_iteration": 2.6009600162506104 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088013, + "balance_loss_mlp": 1.0545156, + "epoch": 0.1816083108888034, + "flos": 512219261952.0, + "grad_norm": 0.05342615519744699, + "language_loss": 0.88234782, + "learning_rate": 0.000940933092909628, + "loss": 0.89322793, + "num_input_tokens_seen": 78262048, + "router_z_loss_mlp": 0.33520508, + "step": 944, + "time_per_iteration": 2.618419647216797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095526, + "balance_loss_mlp": 1.06286263, + "epoch": 0.18180069257406695, + "flos": 492144864768.0, + "grad_norm": 0.053227732023653135, + "language_loss": 0.8393383, + "learning_rate": 0.0009407861151998649, + "loss": 0.85029352, + "num_input_tokens_seen": 78330624, + "router_z_loss_mlp": 0.32666016, + "step": 945, + "time_per_iteration": 2.5718705654144287 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097774, + "balance_loss_mlp": 1.06406188, + "epoch": 0.1819930742593305, + "flos": 569891870208.0, + "grad_norm": 0.05775782434029923, + "language_loss": 0.86156505, + "learning_rate": 0.0009406389663617552, + "loss": 0.87254274, + "num_input_tokens_seen": 78400672, + "router_z_loss_mlp": 0.33740234, + "step": 946, + "time_per_iteration": 2.66513729095459 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097522, + "balance_loss_mlp": 1.06433463, + "epoch": 0.18218545594459407, + "flos": 605693168640.0, + "grad_norm": 0.06350431386522506, + "language_loss": 0.85736459, + "learning_rate": 0.000940491646452427, + "loss": 0.86833978, + "num_input_tokens_seen": 78467952, + "router_z_loss_mlp": 0.33203125, + "step": 947, + "time_per_iteration": 2.715071201324463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01103916, + "balance_loss_mlp": 1.07010818, + "epoch": 0.18237783762985763, + "flos": 548419230720.0, + "grad_norm": 0.06277969821047595, + "language_loss": 0.91195452, + "learning_rate": 0.000940344155529075, + "loss": 0.92299366, + "num_input_tokens_seen": 78538928, + "router_z_loss_mlp": 0.33837891, + "step": 948, + "time_per_iteration": 2.6502938270568848 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01099574, + "balance_loss_mlp": 1.06550407, + "epoch": 0.1825702193151212, + "flos": 450509078016.0, + "grad_norm": 0.06933176029299125, + "language_loss": 0.87683523, + "learning_rate": 0.0009401964936489605, + "loss": 0.88783091, + "num_input_tokens_seen": 78602144, + "router_z_loss_mlp": 0.34106445, + "step": 949, + "time_per_iteration": 2.5181798934936523 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084135, + "balance_loss_mlp": 1.05247355, + "epoch": 0.18276260100038477, + "flos": 588962313216.0, + "grad_norm": 0.07980064544074586, + "language_loss": 0.85422635, + "learning_rate": 0.0009400486608694108, + "loss": 0.86506772, + "num_input_tokens_seen": 78673152, + "router_z_loss_mlp": 0.31640625, + "step": 950, + "time_per_iteration": 2.7189955711364746 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087871, + "balance_loss_mlp": 1.05384839, + "epoch": 0.18295498268564833, + "flos": 786988376064.0, + "grad_norm": 0.05265351460276348, + "language_loss": 0.87225658, + "learning_rate": 0.0009399006572478195, + "loss": 0.88313532, + "num_input_tokens_seen": 78753872, + "router_z_loss_mlp": 0.34033203, + "step": 951, + "time_per_iteration": 3.0805773735046387 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01086089, + "balance_loss_mlp": 1.05218577, + "epoch": 0.1831473643709119, + "flos": 577878271488.0, + "grad_norm": 0.059447924131550096, + "language_loss": 0.91242015, + "learning_rate": 0.0009397524828416468, + "loss": 0.92328107, + "num_input_tokens_seen": 78822640, + "router_z_loss_mlp": 0.33935547, + "step": 952, + "time_per_iteration": 2.6567108631134033 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082155, + "balance_loss_mlp": 1.04801321, + "epoch": 0.18333974605617545, + "flos": 566622521856.0, + "grad_norm": 0.05513512337372911, + "language_loss": 0.96184212, + "learning_rate": 0.0009396041377084192, + "loss": 0.97266364, + "num_input_tokens_seen": 78893792, + "router_z_loss_mlp": 0.34179688, + "step": 953, + "time_per_iteration": 2.6937921047210693 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079403, + "balance_loss_mlp": 1.04478431, + "epoch": 0.183532127741439, + "flos": 526725421056.0, + "grad_norm": 0.07204875194033089, + "language_loss": 0.87840325, + "learning_rate": 0.0009394556219057295, + "loss": 0.88919723, + "num_input_tokens_seen": 78964752, + "router_z_loss_mlp": 0.34667969, + "step": 954, + "time_per_iteration": 2.6962215900421143 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0107777, + "balance_loss_mlp": 1.04272258, + "epoch": 0.18372450942670257, + "flos": 594259918848.0, + "grad_norm": 0.07227161235955501, + "language_loss": 0.83883446, + "learning_rate": 0.0009393069354912362, + "loss": 0.84961212, + "num_input_tokens_seen": 79034400, + "router_z_loss_mlp": 0.35058594, + "step": 955, + "time_per_iteration": 2.718308925628662 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081248, + "balance_loss_mlp": 1.04677236, + "epoch": 0.18391689111196613, + "flos": 644720145408.0, + "grad_norm": 0.07091738302891186, + "language_loss": 0.82511717, + "learning_rate": 0.0009391580785226649, + "loss": 0.83592963, + "num_input_tokens_seen": 79109488, + "router_z_loss_mlp": 0.34521484, + "step": 956, + "time_per_iteration": 2.907367467880249 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01077991, + "balance_loss_mlp": 1.06216049, + "epoch": 0.18410927279722972, + "flos": 1456246563840.0, + "grad_norm": 0.048423099914415325, + "language_loss": 0.79340446, + "learning_rate": 0.0009390090510578067, + "loss": 0.80418444, + "num_input_tokens_seen": 79327712, + "router_z_loss_mlp": 0.15820312, + "step": 957, + "time_per_iteration": 4.78663969039917 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091525, + "balance_loss_mlp": 1.05702567, + "epoch": 0.18430165448249328, + "flos": 658437728256.0, + "grad_norm": 0.09319397884021513, + "language_loss": 0.86484683, + "learning_rate": 0.0009388598531545196, + "loss": 0.8757621, + "num_input_tokens_seen": 79401504, + "router_z_loss_mlp": 0.34545898, + "step": 958, + "time_per_iteration": 2.8470118045806885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087326, + "balance_loss_mlp": 1.05285025, + "epoch": 0.18449403616775684, + "flos": 517679811072.0, + "grad_norm": 0.07377492103556435, + "language_loss": 0.86076611, + "learning_rate": 0.000938710484870727, + "loss": 0.87163937, + "num_input_tokens_seen": 79466688, + "router_z_loss_mlp": 0.3449707, + "step": 959, + "time_per_iteration": 2.5930731296539307 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090986, + "balance_loss_mlp": 1.05672574, + "epoch": 0.1846864178530204, + "flos": 552481537536.0, + "grad_norm": 0.06589557505977534, + "language_loss": 0.86379164, + "learning_rate": 0.0009385609462644189, + "loss": 0.8747015, + "num_input_tokens_seen": 79540288, + "router_z_loss_mlp": 0.34277344, + "step": 960, + "time_per_iteration": 2.688706636428833 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096456, + "balance_loss_mlp": 1.06212378, + "epoch": 0.18487879953828396, + "flos": 465930441216.0, + "grad_norm": 0.0643439417949763, + "language_loss": 0.86035949, + "learning_rate": 0.0009384112373936514, + "loss": 0.871324, + "num_input_tokens_seen": 79611872, + "router_z_loss_mlp": 0.34326172, + "step": 961, + "time_per_iteration": 4.0801496505737305 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095895, + "balance_loss_mlp": 1.06132412, + "epoch": 0.18507118122354752, + "flos": 648200489472.0, + "grad_norm": 0.0614591664996872, + "language_loss": 0.91820455, + "learning_rate": 0.0009382613583165467, + "loss": 0.92916346, + "num_input_tokens_seen": 79689504, + "router_z_loss_mlp": 0.34594727, + "step": 962, + "time_per_iteration": 2.790069341659546 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113098, + "balance_loss_mlp": 1.07921863, + "epoch": 0.18526356290881107, + "flos": 626486330880.0, + "grad_norm": 0.06374556186760763, + "language_loss": 0.89594233, + "learning_rate": 0.0009381113090912928, + "loss": 0.90707326, + "num_input_tokens_seen": 79759264, + "router_z_loss_mlp": 0.33886719, + "step": 963, + "time_per_iteration": 2.6891098022460938 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01117951, + "balance_loss_mlp": 1.08559799, + "epoch": 0.18545594459407463, + "flos": 432497843712.0, + "grad_norm": 0.06491910119233056, + "language_loss": 0.90103394, + "learning_rate": 0.000937961089776144, + "loss": 0.91221344, + "num_input_tokens_seen": 79824464, + "router_z_loss_mlp": 0.32348633, + "step": 964, + "time_per_iteration": 2.5821962356567383 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01124554, + "balance_loss_mlp": 1.08926833, + "epoch": 0.1856483262793382, + "flos": 748720862208.0, + "grad_norm": 0.06849062336391444, + "language_loss": 0.829036, + "learning_rate": 0.0009378107004294208, + "loss": 0.84028149, + "num_input_tokens_seen": 79907152, + "router_z_loss_mlp": 0.35302734, + "step": 965, + "time_per_iteration": 2.9898061752319336 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115255, + "balance_loss_mlp": 1.081972, + "epoch": 0.18584070796460178, + "flos": 530058788352.0, + "grad_norm": 0.08647217477609576, + "language_loss": 0.91352308, + "learning_rate": 0.0009376601411095096, + "loss": 0.92467564, + "num_input_tokens_seen": 79976944, + "router_z_loss_mlp": 0.33300781, + "step": 966, + "time_per_iteration": 2.6415059566497803 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093436, + "balance_loss_mlp": 1.06196475, + "epoch": 0.18603308964986534, + "flos": 482863527936.0, + "grad_norm": 0.05783783242438048, + "language_loss": 0.8708145, + "learning_rate": 0.0009375094118748622, + "loss": 0.88174886, + "num_input_tokens_seen": 80042112, + "router_z_loss_mlp": 0.31445312, + "step": 967, + "time_per_iteration": 2.5149550437927246 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089094, + "balance_loss_mlp": 1.05650234, + "epoch": 0.1862254713351289, + "flos": 800976591360.0, + "grad_norm": 0.0756042683078202, + "language_loss": 0.9083451, + "learning_rate": 0.0009373585127839976, + "loss": 0.91923606, + "num_input_tokens_seen": 80118896, + "router_z_loss_mlp": 0.32592773, + "step": 968, + "time_per_iteration": 2.9569485187530518 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01084434, + "balance_loss_mlp": 1.05250978, + "epoch": 0.18641785302039246, + "flos": 478082456064.0, + "grad_norm": 0.06160067145414361, + "language_loss": 0.91074634, + "learning_rate": 0.0009372074438954994, + "loss": 0.92159069, + "num_input_tokens_seen": 80183360, + "router_z_loss_mlp": 0.3190918, + "step": 969, + "time_per_iteration": 2.508530378341675 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083542, + "balance_loss_mlp": 1.05040169, + "epoch": 0.18661023470565602, + "flos": 388695587328.0, + "grad_norm": 0.07517959095695621, + "language_loss": 0.91676056, + "learning_rate": 0.0009370562052680181, + "loss": 0.92759597, + "num_input_tokens_seen": 80247024, + "router_z_loss_mlp": 0.33154297, + "step": 970, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087332, + "balance_loss_mlp": 1.05400109, + "epoch": 0.18680261639091958, + "flos": 564402207744.0, + "grad_norm": 0.052448577146131624, + "language_loss": 0.89610398, + "learning_rate": 0.0009369047969602695, + "loss": 0.90697736, + "num_input_tokens_seen": 80318256, + "router_z_loss_mlp": 0.33349609, + "step": 971, + "time_per_iteration": 2.714704751968384 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101064, + "balance_loss_mlp": 1.06556404, + "epoch": 0.18699499807618314, + "flos": 479018009088.0, + "grad_norm": 0.06595213007116614, + "language_loss": 0.8674072, + "learning_rate": 0.0009367532190310357, + "loss": 0.87841785, + "num_input_tokens_seen": 80384848, + "router_z_loss_mlp": 0.35498047, + "step": 972, + "time_per_iteration": 2.589667558670044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111914, + "balance_loss_mlp": 1.07660413, + "epoch": 0.1871873797614467, + "flos": 553022802432.0, + "grad_norm": 0.0720295199384638, + "language_loss": 0.88701892, + "learning_rate": 0.0009366014715391644, + "loss": 0.89813805, + "num_input_tokens_seen": 80453088, + "router_z_loss_mlp": 0.35327148, + "step": 973, + "time_per_iteration": 2.634023904800415 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107724, + "balance_loss_mlp": 1.07389259, + "epoch": 0.18737976144671029, + "flos": 552526617600.0, + "grad_norm": 0.05153911900793568, + "language_loss": 0.8432554, + "learning_rate": 0.0009364495545435693, + "loss": 0.85433269, + "num_input_tokens_seen": 80528608, + "router_z_loss_mlp": 0.33837891, + "step": 974, + "time_per_iteration": 2.7729458808898926 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107099, + "balance_loss_mlp": 1.07281494, + "epoch": 0.18757214313197385, + "flos": 502002372096.0, + "grad_norm": 0.05815108638233015, + "language_loss": 0.88620323, + "learning_rate": 0.0009362974681032297, + "loss": 0.8972742, + "num_input_tokens_seen": 80599600, + "router_z_loss_mlp": 0.34326172, + "step": 975, + "time_per_iteration": 2.631744623184204 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01105498, + "balance_loss_mlp": 1.07130909, + "epoch": 0.1877645248172374, + "flos": 674691337728.0, + "grad_norm": 0.06841603134690444, + "language_loss": 0.88265896, + "learning_rate": 0.0009361452122771907, + "loss": 0.89371395, + "num_input_tokens_seen": 80677264, + "router_z_loss_mlp": 0.34204102, + "step": 976, + "time_per_iteration": 2.8427281379699707 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094167, + "balance_loss_mlp": 1.06012082, + "epoch": 0.18795690650250096, + "flos": 404771696640.0, + "grad_norm": 0.07319435948671522, + "language_loss": 0.8377496, + "learning_rate": 0.0009359927871245635, + "loss": 0.84869128, + "num_input_tokens_seen": 80739776, + "router_z_loss_mlp": 0.34057617, + "step": 977, + "time_per_iteration": 2.4665186405181885 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090091, + "balance_loss_mlp": 1.0565697, + "epoch": 0.18814928818776452, + "flos": 637599485952.0, + "grad_norm": 0.05986452276683665, + "language_loss": 0.86337954, + "learning_rate": 0.0009358401927045246, + "loss": 0.87428045, + "num_input_tokens_seen": 80815200, + "router_z_loss_mlp": 0.33520508, + "step": 978, + "time_per_iteration": 2.8037781715393066 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090672, + "balance_loss_mlp": 1.05707908, + "epoch": 0.18834166987302808, + "flos": 1137825446400.0, + "grad_norm": 0.054509582003230646, + "language_loss": 0.88314402, + "learning_rate": 0.0009356874290763166, + "loss": 0.89405078, + "num_input_tokens_seen": 80905024, + "router_z_loss_mlp": 0.33618164, + "step": 979, + "time_per_iteration": 3.456723213195801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097981, + "balance_loss_mlp": 1.06481671, + "epoch": 0.18853405155829164, + "flos": 504538398720.0, + "grad_norm": 0.06366920756378494, + "language_loss": 0.8866874, + "learning_rate": 0.0009355344962992474, + "loss": 0.89766723, + "num_input_tokens_seen": 80976704, + "router_z_loss_mlp": 0.33154297, + "step": 980, + "time_per_iteration": 2.6105339527130127 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109825, + "balance_loss_mlp": 1.06494308, + "epoch": 0.1887264332435552, + "flos": 607879987200.0, + "grad_norm": 0.05130215804193928, + "language_loss": 0.88147485, + "learning_rate": 0.0009353813944326908, + "loss": 0.89245737, + "num_input_tokens_seen": 81057152, + "router_z_loss_mlp": 0.33325195, + "step": 981, + "time_per_iteration": 2.882836103439331 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109809, + "balance_loss_mlp": 1.0758822, + "epoch": 0.1889188149288188, + "flos": 552264749568.0, + "grad_norm": 0.07032712681879846, + "language_loss": 0.83146608, + "learning_rate": 0.0009352281235360863, + "loss": 0.84256417, + "num_input_tokens_seen": 81131520, + "router_z_loss_mlp": 0.33959961, + "step": 982, + "time_per_iteration": 2.695748805999756 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01120418, + "balance_loss_mlp": 1.08775461, + "epoch": 0.18911119661408235, + "flos": 418332128256.0, + "grad_norm": 0.06033753714629359, + "language_loss": 0.84987485, + "learning_rate": 0.0009350746836689389, + "loss": 0.86107904, + "num_input_tokens_seen": 81195952, + "router_z_loss_mlp": 0.32666016, + "step": 983, + "time_per_iteration": 2.5073440074920654 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01260435, + "balance_loss_mlp": 1.23916793, + "epoch": 0.1893035782993459, + "flos": 1481141320704.0, + "grad_norm": 0.0731593378732656, + "language_loss": 0.81439221, + "learning_rate": 0.0009349210748908193, + "loss": 0.82699656, + "num_input_tokens_seen": 81427312, + "router_z_loss_mlp": 0.21289062, + "step": 984, + "time_per_iteration": 5.065609931945801 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01133244, + "balance_loss_mlp": 1.09831583, + "epoch": 0.18949595998460947, + "flos": 508220974080.0, + "grad_norm": 0.09166419018528392, + "language_loss": 0.83211792, + "learning_rate": 0.0009347672972613634, + "loss": 0.84345031, + "num_input_tokens_seen": 81494256, + "router_z_loss_mlp": 0.34936523, + "step": 985, + "time_per_iteration": 2.580009937286377 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01115864, + "balance_loss_mlp": 1.08270001, + "epoch": 0.18968834166987303, + "flos": 530812459008.0, + "grad_norm": 0.0668772854373454, + "language_loss": 0.85875785, + "learning_rate": 0.0009346133508402735, + "loss": 0.8699165, + "num_input_tokens_seen": 81569312, + "router_z_loss_mlp": 0.33178711, + "step": 986, + "time_per_iteration": 2.6872711181640625 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01111031, + "balance_loss_mlp": 1.07724667, + "epoch": 0.1898807233551366, + "flos": 499515807744.0, + "grad_norm": 0.11088649382938841, + "language_loss": 0.8420769, + "learning_rate": 0.0009344592356873166, + "loss": 0.8531872, + "num_input_tokens_seen": 81637024, + "router_z_loss_mlp": 0.33813477, + "step": 987, + "time_per_iteration": 2.6347994804382324 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098007, + "balance_loss_mlp": 1.06462848, + "epoch": 0.19007310504040015, + "flos": 601936399872.0, + "grad_norm": 0.05765681888892058, + "language_loss": 0.78527796, + "learning_rate": 0.0009343049518623255, + "loss": 0.79625803, + "num_input_tokens_seen": 81709488, + "router_z_loss_mlp": 0.33398438, + "step": 988, + "time_per_iteration": 2.696929693222046 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01082914, + "balance_loss_mlp": 1.05029869, + "epoch": 0.1902654867256637, + "flos": 601374786048.0, + "grad_norm": 0.05732720380572914, + "language_loss": 0.83250153, + "learning_rate": 0.0009341504994251985, + "loss": 0.84333068, + "num_input_tokens_seen": 81787152, + "router_z_loss_mlp": 0.32617188, + "step": 989, + "time_per_iteration": 2.8399016857147217 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095582, + "balance_loss_mlp": 1.07841623, + "epoch": 0.19045786841092727, + "flos": 1574925147648.0, + "grad_norm": 0.03888561388969961, + "language_loss": 0.73520499, + "learning_rate": 0.0009339958784358994, + "loss": 0.74616081, + "num_input_tokens_seen": 82030608, + "router_z_loss_mlp": 0.171875, + "step": 990, + "time_per_iteration": 5.072636842727661 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0109747, + "balance_loss_mlp": 1.06394839, + "epoch": 0.19065025009619085, + "flos": 681280906752.0, + "grad_norm": 0.135211113906906, + "language_loss": 0.818295, + "learning_rate": 0.0009338410889544574, + "loss": 0.82926977, + "num_input_tokens_seen": 82119872, + "router_z_loss_mlp": 0.33544922, + "step": 991, + "time_per_iteration": 3.050665855407715 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101867, + "balance_loss_mlp": 1.06786811, + "epoch": 0.1908426317814544, + "flos": 601971305472.0, + "grad_norm": 0.06286082016671143, + "language_loss": 0.87738663, + "learning_rate": 0.000933686131040967, + "loss": 0.88840532, + "num_input_tokens_seen": 82195552, + "router_z_loss_mlp": 0.34033203, + "step": 992, + "time_per_iteration": 2.7589659690856934 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089273, + "balance_loss_mlp": 1.05672884, + "epoch": 0.19103501346671797, + "flos": 586027616256.0, + "grad_norm": 0.0561482479745879, + "language_loss": 0.90427077, + "learning_rate": 0.0009335310047555883, + "loss": 0.91516346, + "num_input_tokens_seen": 82267040, + "router_z_loss_mlp": 0.32543945, + "step": 993, + "time_per_iteration": 2.7133467197418213 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108709, + "balance_loss_mlp": 1.0532825, + "epoch": 0.19122739515198153, + "flos": 545494708224.0, + "grad_norm": 0.06221036652136981, + "language_loss": 0.88114065, + "learning_rate": 0.0009333757101585467, + "loss": 0.89201152, + "num_input_tokens_seen": 82337680, + "router_z_loss_mlp": 0.33837891, + "step": 994, + "time_per_iteration": 2.6733241081237793 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083527, + "balance_loss_mlp": 1.05105424, + "epoch": 0.1914197768372451, + "flos": 521171739648.0, + "grad_norm": 0.05606370206634765, + "language_loss": 0.93617988, + "learning_rate": 0.0009332202473101329, + "loss": 0.94701517, + "num_input_tokens_seen": 82409600, + "router_z_loss_mlp": 0.32470703, + "step": 995, + "time_per_iteration": 2.6689558029174805 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01079312, + "balance_loss_mlp": 1.04536152, + "epoch": 0.19161215852250865, + "flos": 610961660928.0, + "grad_norm": 0.05986652691328414, + "language_loss": 0.83121806, + "learning_rate": 0.0009330646162707028, + "loss": 0.84201121, + "num_input_tokens_seen": 82480288, + "router_z_loss_mlp": 0.33984375, + "step": 996, + "time_per_iteration": 2.7264511585235596 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01081823, + "balance_loss_mlp": 1.04849207, + "epoch": 0.1918045402077722, + "flos": 846281806848.0, + "grad_norm": 0.05485586532204223, + "language_loss": 0.84800065, + "learning_rate": 0.0009329088171006779, + "loss": 0.85881883, + "num_input_tokens_seen": 82568960, + "router_z_loss_mlp": 0.33349609, + "step": 997, + "time_per_iteration": 3.1486315727233887 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097705, + "balance_loss_mlp": 1.06220424, + "epoch": 0.19199692189303577, + "flos": 465699096576.0, + "grad_norm": 0.06540772430376247, + "language_loss": 0.84963006, + "learning_rate": 0.0009327528498605446, + "loss": 0.86060709, + "num_input_tokens_seen": 82634128, + "router_z_loss_mlp": 0.35522461, + "step": 998, + "time_per_iteration": 2.532460927963257 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087828, + "balance_loss_mlp": 1.0542109, + "epoch": 0.19218930357829936, + "flos": 531318818304.0, + "grad_norm": 0.06065225266474605, + "language_loss": 0.89716202, + "learning_rate": 0.0009325967146108548, + "loss": 0.90804029, + "num_input_tokens_seen": 82707472, + "router_z_loss_mlp": 0.33642578, + "step": 999, + "time_per_iteration": 2.6381072998046875 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0108792, + "balance_loss_mlp": 1.05334902, + "epoch": 0.19238168526356292, + "flos": 601350054912.0, + "grad_norm": 0.06318510310852068, + "language_loss": 0.87984866, + "learning_rate": 0.0009324404114122258, + "loss": 0.89072788, + "num_input_tokens_seen": 82775232, + "router_z_loss_mlp": 0.34594727, + "step": 1000, + "time_per_iteration": 2.7017252445220947 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088105, + "balance_loss_mlp": 1.0544883, + "epoch": 0.19257406694882648, + "flos": 571690192896.0, + "grad_norm": 0.05361295189234855, + "language_loss": 0.87132722, + "learning_rate": 0.0009322839403253397, + "loss": 0.88220823, + "num_input_tokens_seen": 82850032, + "router_z_loss_mlp": 0.33642578, + "step": 1001, + "time_per_iteration": 2.7725350856781006 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091645, + "balance_loss_mlp": 1.05759907, + "epoch": 0.19276644863409004, + "flos": 801478568448.0, + "grad_norm": 0.0661765462165054, + "language_loss": 0.84038174, + "learning_rate": 0.0009321273014109439, + "loss": 0.85129815, + "num_input_tokens_seen": 82926080, + "router_z_loss_mlp": 0.34082031, + "step": 1002, + "time_per_iteration": 2.9275383949279785 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01089997, + "balance_loss_mlp": 1.05676103, + "epoch": 0.1929588303193536, + "flos": 563024314368.0, + "grad_norm": 0.05133430998282463, + "language_loss": 0.85232604, + "learning_rate": 0.0009319704947298513, + "loss": 0.863226, + "num_input_tokens_seen": 83005200, + "router_z_loss_mlp": 0.33251953, + "step": 1003, + "time_per_iteration": 2.9198272228240967 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01083204, + "balance_loss_mlp": 1.05120838, + "epoch": 0.19315121200461716, + "flos": 626550349824.0, + "grad_norm": 0.04652496586479965, + "language_loss": 0.88737059, + "learning_rate": 0.0009318135203429393, + "loss": 0.8982026, + "num_input_tokens_seen": 83077280, + "router_z_loss_mlp": 0.31982422, + "step": 1004, + "time_per_iteration": 2.7145965099334717 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01094807, + "balance_loss_mlp": 1.06116605, + "epoch": 0.19334359368988072, + "flos": 517169069568.0, + "grad_norm": 0.06711221272981459, + "language_loss": 0.88228458, + "learning_rate": 0.0009316563783111511, + "loss": 0.8932327, + "num_input_tokens_seen": 83145456, + "router_z_loss_mlp": 0.33642578, + "step": 1005, + "time_per_iteration": 2.68135404586792 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095243, + "balance_loss_mlp": 1.06050563, + "epoch": 0.19353597537514428, + "flos": 693751606272.0, + "grad_norm": 0.04947727679523619, + "language_loss": 0.82323831, + "learning_rate": 0.0009314990686954943, + "loss": 0.83419079, + "num_input_tokens_seen": 83225392, + "router_z_loss_mlp": 0.34765625, + "step": 1006, + "time_per_iteration": 2.9068872928619385 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098932, + "balance_loss_mlp": 1.06495738, + "epoch": 0.19372835706040784, + "flos": 1209665180160.0, + "grad_norm": 0.05336104081377929, + "language_loss": 0.80917025, + "learning_rate": 0.000931341591557042, + "loss": 0.82015955, + "num_input_tokens_seen": 83331296, + "router_z_loss_mlp": 0.34008789, + "step": 1007, + "time_per_iteration": 3.759119749069214 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01098415, + "balance_loss_mlp": 1.06291509, + "epoch": 0.19392073874567142, + "flos": 520368606720.0, + "grad_norm": 0.06549831272650784, + "language_loss": 0.87757689, + "learning_rate": 0.0009311839469569325, + "loss": 0.88856107, + "num_input_tokens_seen": 83399952, + "router_z_loss_mlp": 0.35522461, + "step": 1008, + "time_per_iteration": 2.6298930644989014 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01100893, + "balance_loss_mlp": 1.06620264, + "epoch": 0.19411312043093498, + "flos": 588543293952.0, + "grad_norm": 0.06763315162421418, + "language_loss": 0.8732397, + "learning_rate": 0.0009310261349563687, + "loss": 0.88424855, + "num_input_tokens_seen": 83468384, + "router_z_loss_mlp": 0.34692383, + "step": 1009, + "time_per_iteration": 2.6843061447143555 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110066, + "balance_loss_mlp": 1.06718588, + "epoch": 0.19430550211619854, + "flos": 579085867008.0, + "grad_norm": 0.05371296475785438, + "language_loss": 0.8534441, + "learning_rate": 0.0009308681556166186, + "loss": 0.86445075, + "num_input_tokens_seen": 83547952, + "router_z_loss_mlp": 0.33496094, + "step": 1010, + "time_per_iteration": 2.8197336196899414 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107606, + "balance_loss_mlp": 1.07291579, + "epoch": 0.1944978838014621, + "flos": 620848281600.0, + "grad_norm": 0.08312668477716535, + "language_loss": 0.87206143, + "learning_rate": 0.0009307100089990152, + "loss": 0.88313752, + "num_input_tokens_seen": 83615712, + "router_z_loss_mlp": 0.34716797, + "step": 1011, + "time_per_iteration": 2.7118990421295166 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101924, + "balance_loss_mlp": 1.0672822, + "epoch": 0.19469026548672566, + "flos": 598440089088.0, + "grad_norm": 0.061832865854500894, + "language_loss": 0.83946323, + "learning_rate": 0.0009305516951649568, + "loss": 0.85048252, + "num_input_tokens_seen": 83687296, + "router_z_loss_mlp": 0.34667969, + "step": 1012, + "time_per_iteration": 2.667672872543335 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096764, + "balance_loss_mlp": 1.06314659, + "epoch": 0.19488264717198922, + "flos": 551890810368.0, + "grad_norm": 0.04827143175142062, + "language_loss": 0.87187612, + "learning_rate": 0.0009303932141759057, + "loss": 0.88284373, + "num_input_tokens_seen": 83763168, + "router_z_loss_mlp": 0.33642578, + "step": 1013, + "time_per_iteration": 2.7321088314056396 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01101502, + "balance_loss_mlp": 1.06705046, + "epoch": 0.19507502885725278, + "flos": 665842166784.0, + "grad_norm": 0.05715794205563071, + "language_loss": 0.84201366, + "learning_rate": 0.0009302345660933902, + "loss": 0.85302866, + "num_input_tokens_seen": 83837312, + "router_z_loss_mlp": 0.3449707, + "step": 1014, + "time_per_iteration": 2.7699263095855713 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109904, + "balance_loss_mlp": 1.07616735, + "epoch": 0.19526741054251634, + "flos": 670771625472.0, + "grad_norm": 0.05949834877265084, + "language_loss": 0.84866655, + "learning_rate": 0.0009300757509790026, + "loss": 0.85976553, + "num_input_tokens_seen": 83917120, + "router_z_loss_mlp": 0.33764648, + "step": 1015, + "time_per_iteration": 2.8250515460968018 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01110813, + "balance_loss_mlp": 1.0766474, + "epoch": 0.19545979222777993, + "flos": 446983653888.0, + "grad_norm": 0.0671511226198219, + "language_loss": 0.90974069, + "learning_rate": 0.0009299167688944005, + "loss": 0.92084885, + "num_input_tokens_seen": 83982992, + "router_z_loss_mlp": 0.34204102, + "step": 1016, + "time_per_iteration": 2.545133590698242 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0111157, + "balance_loss_mlp": 1.07778645, + "epoch": 0.1956521739130435, + "flos": 568813722624.0, + "grad_norm": 0.06338586690579641, + "language_loss": 0.85958129, + "learning_rate": 0.0009297576199013063, + "loss": 0.87069696, + "num_input_tokens_seen": 84057296, + "router_z_loss_mlp": 0.33813477, + "step": 1017, + "time_per_iteration": 2.668503761291504 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01148218, + "balance_loss_mlp": 1.13295972, + "epoch": 0.19584455559830705, + "flos": 1454969157120.0, + "grad_norm": 0.047651466398381144, + "language_loss": 0.73002136, + "learning_rate": 0.0009295983040615071, + "loss": 0.74150348, + "num_input_tokens_seen": 84292640, + "router_z_loss_mlp": 0.15234375, + "step": 1018, + "time_per_iteration": 4.920944929122925 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01104842, + "balance_loss_mlp": 1.09015501, + "epoch": 0.1960369372835706, + "flos": 1590320369664.0, + "grad_norm": 0.036993279908541045, + "language_loss": 0.79426301, + "learning_rate": 0.0009294388214368547, + "loss": 0.80531144, + "num_input_tokens_seen": 84524448, + "router_z_loss_mlp": 0.14648438, + "step": 1019, + "time_per_iteration": 6.0059425830841064 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01118502, + "balance_loss_mlp": 1.08505166, + "epoch": 0.19622931896883417, + "flos": 615709237248.0, + "grad_norm": 0.05240041234704895, + "language_loss": 0.86600977, + "learning_rate": 0.0009292791720892659, + "loss": 0.87719476, + "num_input_tokens_seen": 84600208, + "router_z_loss_mlp": 0.3347168, + "step": 1020, + "time_per_iteration": 2.995192527770996 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01113873, + "balance_loss_mlp": 1.07930255, + "epoch": 0.19642170065409773, + "flos": 465950790144.0, + "grad_norm": 0.0657036282835547, + "language_loss": 0.88724279, + "learning_rate": 0.0009291193560807218, + "loss": 0.89838147, + "num_input_tokens_seen": 84668032, + "router_z_loss_mlp": 0.34594727, + "step": 1021, + "time_per_iteration": 2.633256196975708 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01114293, + "balance_loss_mlp": 1.07962656, + "epoch": 0.19661408233936128, + "flos": 515040477696.0, + "grad_norm": 0.054836200403870924, + "language_loss": 0.87439638, + "learning_rate": 0.0009289593734732688, + "loss": 0.88553929, + "num_input_tokens_seen": 84738176, + "router_z_loss_mlp": 0.34716797, + "step": 1022, + "time_per_iteration": 2.622284173965454 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01107262, + "balance_loss_mlp": 1.0736922, + "epoch": 0.19680646402462484, + "flos": 392427624960.0, + "grad_norm": 0.053036961045345866, + "language_loss": 0.94139373, + "learning_rate": 0.0009287992243290175, + "loss": 0.95246631, + "num_input_tokens_seen": 84799936, + "router_z_loss_mlp": 0.3359375, + "step": 1023, + "time_per_iteration": 2.4402668476104736 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01108975, + "balance_loss_mlp": 1.07247353, + "epoch": 0.19699884570988843, + "flos": 626122566144.0, + "grad_norm": 0.056904835680118435, + "language_loss": 0.90850759, + "learning_rate": 0.0009286389087101435, + "loss": 0.91959733, + "num_input_tokens_seen": 84877216, + "router_z_loss_mlp": 0.36523438, + "step": 1024, + "time_per_iteration": 2.762068271636963 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.0110519, + "balance_loss_mlp": 1.06957078, + "epoch": 0.197191227395152, + "flos": 557710742016.0, + "grad_norm": 0.05298833269370499, + "language_loss": 0.88575542, + "learning_rate": 0.0009284784266788864, + "loss": 0.89680731, + "num_input_tokens_seen": 84952464, + "router_z_loss_mlp": 0.35668945, + "step": 1025, + "time_per_iteration": 4.087035417556763 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01109606, + "balance_loss_mlp": 1.07565546, + "epoch": 0.19738360908041555, + "flos": 664681061376.0, + "grad_norm": 0.0565537913278748, + "language_loss": 0.92494339, + "learning_rate": 0.0009283177782975512, + "loss": 0.93603945, + "num_input_tokens_seen": 85031488, + "router_z_loss_mlp": 0.33984375, + "step": 1026, + "time_per_iteration": 2.948167562484741 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01095626, + "balance_loss_mlp": 1.06117415, + "epoch": 0.1975759907656791, + "flos": 522244094976.0, + "grad_norm": 0.06218898027866582, + "language_loss": 0.88052273, + "learning_rate": 0.000928156963628507, + "loss": 0.89147896, + "num_input_tokens_seen": 85098384, + "router_z_loss_mlp": 0.3449707, + "step": 1027, + "time_per_iteration": 2.564019203186035 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01091019, + "balance_loss_mlp": 1.05694866, + "epoch": 0.19776837245094267, + "flos": 462233309184.0, + "grad_norm": 0.056114928823487176, + "language_loss": 0.8826099, + "learning_rate": 0.0009279959827341877, + "loss": 0.89352006, + "num_input_tokens_seen": 85172944, + "router_z_loss_mlp": 0.34082031, + "step": 1028, + "time_per_iteration": 2.7226340770721436 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090946, + "balance_loss_mlp": 1.05699515, + "epoch": 0.19796075413620623, + "flos": 502809887232.0, + "grad_norm": 0.05507551359640612, + "language_loss": 0.88204837, + "learning_rate": 0.0009278348356770915, + "loss": 0.89295781, + "num_input_tokens_seen": 85241632, + "router_z_loss_mlp": 0.33984375, + "step": 1029, + "time_per_iteration": 2.592756748199463 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085455, + "balance_loss_mlp": 1.05093157, + "epoch": 0.1981531358214698, + "flos": 507281038848.0, + "grad_norm": 0.061172366255401664, + "language_loss": 0.85939109, + "learning_rate": 0.0009276735225197814, + "loss": 0.87024558, + "num_input_tokens_seen": 85308992, + "router_z_loss_mlp": 0.34570312, + "step": 1030, + "time_per_iteration": 2.598607063293457 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01088832, + "balance_loss_mlp": 1.05495238, + "epoch": 0.19834551750673335, + "flos": 531275148288.0, + "grad_norm": 0.0802549423316463, + "language_loss": 0.86293721, + "learning_rate": 0.0009275120433248847, + "loss": 0.87382561, + "num_input_tokens_seen": 85381936, + "router_z_loss_mlp": 0.33886719, + "step": 1031, + "time_per_iteration": 2.7143311500549316 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01090216, + "balance_loss_mlp": 1.05726683, + "epoch": 0.1985378991919969, + "flos": 775147691520.0, + "grad_norm": 0.05308511447166053, + "language_loss": 0.86272347, + "learning_rate": 0.0009273503981550931, + "loss": 0.87362564, + "num_input_tokens_seen": 85474352, + "router_z_loss_mlp": 0.32958984, + "step": 1032, + "time_per_iteration": 3.0616648197174072 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01087082, + "balance_loss_mlp": 1.05351269, + "epoch": 0.1987302808772605, + "flos": 434063411712.0, + "grad_norm": 0.059916166081832097, + "language_loss": 0.8703599, + "learning_rate": 0.0009271885870731626, + "loss": 0.88123071, + "num_input_tokens_seen": 85538416, + "router_z_loss_mlp": 0.3359375, + "step": 1033, + "time_per_iteration": 2.487316131591797 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01092715, + "balance_loss_mlp": 1.05921745, + "epoch": 0.19892266256252406, + "flos": 553342897152.0, + "grad_norm": 0.06168947094446192, + "language_loss": 0.88599998, + "learning_rate": 0.0009270266101419143, + "loss": 0.89692712, + "num_input_tokens_seen": 85604416, + "router_z_loss_mlp": 0.33520508, + "step": 1034, + "time_per_iteration": 2.5978119373321533 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01085912, + "balance_loss_mlp": 1.05403578, + "epoch": 0.19911504424778761, + "flos": 549596302848.0, + "grad_norm": 0.06019117447906982, + "language_loss": 0.85564321, + "learning_rate": 0.0009268644674242328, + "loss": 0.86650234, + "num_input_tokens_seen": 85677008, + "router_z_loss_mlp": 0.31860352, + "step": 1035, + "time_per_iteration": 2.7259163856506348 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097892, + "balance_loss_mlp": 1.0645138, + "epoch": 0.19930742593305117, + "flos": 518024636928.0, + "grad_norm": 0.05869793462101787, + "language_loss": 0.81141233, + "learning_rate": 0.0009267021589830678, + "loss": 0.82239127, + "num_input_tokens_seen": 85745200, + "router_z_loss_mlp": 0.33398438, + "step": 1036, + "time_per_iteration": 2.597724199295044 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01161292, + "balance_loss_mlp": 1.14507985, + "epoch": 0.19949980761831473, + "flos": 1508516849664.0, + "grad_norm": 0.04621309141147155, + "language_loss": 0.77627081, + "learning_rate": 0.0009265396848814328, + "loss": 0.78788376, + "num_input_tokens_seen": 85980608, + "router_z_loss_mlp": 0.16210938, + "step": 1037, + "time_per_iteration": 4.918612241744995 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01093993, + "balance_loss_mlp": 1.06044722, + "epoch": 0.1996921893035783, + "flos": 697803738624.0, + "grad_norm": 0.061892224045152405, + "language_loss": 0.93283784, + "learning_rate": 0.000926377045182406, + "loss": 0.94377768, + "num_input_tokens_seen": 86055952, + "router_z_loss_mlp": 0.33569336, + "step": 1038, + "time_per_iteration": 2.8800160884857178 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01096412, + "balance_loss_mlp": 1.06334293, + "epoch": 0.19988457098884185, + "flos": 726682226688.0, + "grad_norm": 0.0613562398808313, + "language_loss": 0.87972045, + "learning_rate": 0.0009262142399491296, + "loss": 0.89068449, + "num_input_tokens_seen": 86145536, + "router_z_loss_mlp": 0.33081055, + "step": 1039, + "time_per_iteration": 3.0561435222625732 + }, + { + "auxiliary_loss_clip": 0.0, + "auxiliary_loss_mlp": 0.01097284, + "balance_loss_mlp": 1.06345224, + "epoch": 0.2000769526741054, + "flos": 560275881984.0, + "grad_norm": 0.06364175085873486, + "language_loss": 0.87837642, + "learning_rate": 0.0009260512692448105, + "loss": 0.88934934, + "num_input_tokens_seen": 86214480, + "router_z_loss_mlp": 0.33862305, + "step": 1040, + "time_per_iteration": 2.7037088871002197 + } + ], + "logging_steps": 1.0, + "max_steps": 5198, + "num_input_tokens_seen": 86214480, + "num_train_epochs": 1, + "save_steps": 1040, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2342240041697280.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}